howard.objects.variants

    1import csv
    2import gc
    3import gzip
    4import io
    5import multiprocessing
    6import os
    7import random
    8import re
    9import shlex
   10import sqlite3
   11import subprocess
   12from tempfile import NamedTemporaryFile, TemporaryDirectory
   13import tempfile
   14import duckdb
   15import json
   16import yaml
   17import argparse
   18import Bio.bgzf as bgzf
   19import pandas as pd
   20from pyfaidx import Fasta
   21import numpy as np
   22import vcf
   23import logging as log
   24import fastparquet as fp
   25from multiprocesspandas import applyparallel
   26
   27from howard.functions.commons import *
   28from howard.objects.database import *
   29from howard.functions.databases import *
   30from howard.functions.utils import *
   31
   32
   33class Variants:
   34
   35    def __init__(
   36        self,
   37        conn=None,
   38        input: str = None,
   39        output: str = None,
   40        config: dict = {},
   41        param: dict = {},
   42        load: bool = False,
   43    ) -> None:
   44        """
   45        The function `__init__` initializes the variables, sets the input, output, config, param, connexion and
   46        header
   47
   48        :param conn: the connection to the database
   49        :param input: the input file
   50        :param output: the output file
   51        :param config: a dictionary containing the configuration of the model
   52        :param param: a dictionary containing the parameters of the model
   53        """
   54
   55        # Init variables
   56        self.init_variables()
   57
   58        # Input
   59        self.set_input(input)
   60
   61        # Config
   62        self.set_config(config)
   63
   64        # Param
   65        self.set_param(param)
   66
   67        # Output
   68        self.set_output(output)
   69
   70        # connexion
   71        self.set_connexion(conn)
   72
   73        # Header
   74        self.set_header()
   75
   76        # Samples
   77        self.set_samples()
   78
   79        # Load data
   80        if load:
   81            self.load_data()
   82
   83    def set_samples(self, samples: list = None) -> list:
   84        """
   85        The function `set_samples` sets the samples attribute of an object to a provided list or
   86        retrieves it from a parameter dictionary.
   87
   88        :param samples: The `set_samples` method is a method of a class that takes a list of samples as
   89        input and sets the `samples` attribute of the class to the provided list. If no samples are
   90        provided, it tries to get the samples from the class's parameters using the `get_param` method
   91        :type samples: list
   92        :return: The `samples` list is being returned.
   93        """
   94
   95        if not samples:
   96            samples = self.get_param().get("samples", {}).get("list", None)
   97
   98        self.samples = samples
   99
  100        return samples
  101
  102    def get_samples(self) -> list:
  103        """
  104        This function returns a list of samples.
  105        :return: The `get_samples` method is returning the `samples` attribute of the object.
  106        """
  107
  108        return self.samples
  109
  110    def get_samples_check(self) -> bool:
  111        """
  112        This function returns the value of the "check" key within the "samples" dictionary retrieved
  113        from the parameters.
  114        :return: The method `get_samples_check` is returning the value of the key "check" inside the
  115        "samples" dictionary, which is nested inside the dictionary returned by the `get_param()`
  116        method. If the key "check" is not found, it will return `False`.
  117        """
  118
  119        return self.get_param().get("samples", {}).get("check", True)
  120
  121    def set_input(self, input: str = None) -> None:
  122        """
  123        The function `set_input` takes a file name as input, extracts the name and extension, and sets
  124        attributes in the class accordingly.
  125
  126        :param input: The `set_input` method in the provided code snippet is used to set attributes
  127        related to the input file. Here's a breakdown of the parameters and their usage in the method:
  128        :type input: str
  129        """
  130
  131        if input and not isinstance(input, str):
  132            try:
  133                self.input = input.name
  134            except:
  135                log.error(f"Input file '{input} in bad format")
  136                raise ValueError(f"Input file '{input} in bad format")
  137        else:
  138            self.input = input
  139
  140        # Input format
  141        if input:
  142            input_name, input_extension = os.path.splitext(self.input)
  143            self.input_name = input_name
  144            self.input_extension = input_extension
  145            self.input_format = self.input_extension.replace(".", "")
  146
  147    def set_config(self, config: dict) -> None:
  148        """
  149        The set_config function takes a config object and assigns it as the configuration object for the
  150        class.
  151
  152        :param config: The `config` parameter in the `set_config` function is a dictionary object that
  153        contains configuration settings for the class. When you call the `set_config` function with a
  154        dictionary object as the argument, it will set that dictionary as the configuration object for
  155        the class
  156        :type config: dict
  157        """
  158
  159        self.config = config
  160
  161    def set_param(self, param: dict) -> None:
  162        """
  163        This function sets a parameter object for the class based on the input dictionary.
  164
  165        :param param: The `set_param` method you provided takes a dictionary object as input and sets it
  166        as the `param` attribute of the class instance
  167        :type param: dict
  168        """
  169
  170        self.param = param
  171
  172    def init_variables(self) -> None:
  173        """
  174        This function initializes the variables that will be used in the rest of the class
  175        """
  176
  177        self.prefix = "howard"
  178        self.table_variants = "variants"
  179        self.dataframe = None
  180
  181        self.comparison_map = {
  182            "gt": ">",
  183            "gte": ">=",
  184            "lt": "<",
  185            "lte": "<=",
  186            "equals": "=",
  187            "contains": "SIMILAR TO",
  188        }
  189
  190        self.code_type_map = {"Integer": 0, "String": 1, "Float": 2, "Flag": 3}
  191
  192        self.code_type_map_to_sql = {
  193            "Integer": "INTEGER",
  194            "String": "VARCHAR",
  195            "Float": "FLOAT",
  196            "Flag": "VARCHAR",
  197        }
  198
  199        self.index_additionnal_fields = []
  200
  201    def get_indexing(self) -> bool:
  202        """
  203        It returns the value of the key "indexing" in the dictionary. If the key is not present, it
  204        returns False.
  205        :return: The value of the indexing parameter.
  206        """
  207
  208        return self.get_param().get("indexing", False)
  209
  210    def get_connexion_config(self) -> dict:
  211        """
  212        The function `get_connexion_config` returns a dictionary containing the configuration for a
  213        connection, including the number of threads and memory limit.
  214        :return: a dictionary containing the configuration for the Connexion library.
  215        """
  216
  217        # config
  218        config = self.get_config()
  219
  220        # Connexion config
  221        connexion_config = {}
  222        threads = self.get_threads()
  223
  224        # Threads
  225        if threads:
  226            connexion_config["threads"] = threads
  227
  228        # Memory
  229        # if config.get("memory", None):
  230        #     connexion_config["memory_limit"] = config.get("memory")
  231        if self.get_memory():
  232            connexion_config["memory_limit"] = self.get_memory()
  233
  234        # Temporary directory
  235        if config.get("tmp", None):
  236            connexion_config["temp_directory"] = config.get("tmp")
  237
  238        # Access
  239        if config.get("access", None):
  240            access = config.get("access")
  241            if access in ["RO"]:
  242                access = "READ_ONLY"
  243            elif access in ["RW"]:
  244                access = "READ_WRITE"
  245            connexion_db = self.get_connexion_db()
  246            if connexion_db in ":memory:":
  247                access = "READ_WRITE"
  248            connexion_config["access_mode"] = access
  249
  250        return connexion_config
  251
  252    def get_duckdb_settings(self) -> dict:
  253        """
  254        The function `get_duckdb_settings` retrieves DuckDB settings from a configuration file or a
  255        string.
  256        :return: The function `get_duckdb_settings` returns a dictionary object `duckdb_settings_dict`.
  257        """
  258
  259        # config
  260        config = self.get_config()
  261
  262        # duckdb settings
  263        duckdb_settings_dict = {}
  264        if config.get("duckdb_settings", None):
  265            duckdb_settings = config.get("duckdb_settings")
  266            duckdb_settings = full_path(duckdb_settings)
  267            # duckdb setting is a file
  268            if os.path.exists(duckdb_settings):
  269                with open(duckdb_settings) as json_file:
  270                    duckdb_settings_dict = yaml.safe_load(json_file)
  271            # duckdb settings is a string
  272            else:
  273                duckdb_settings_dict = json.loads(duckdb_settings)
  274
  275        return duckdb_settings_dict
  276
  277    def set_connexion_db(self) -> str:
  278        """
  279        The function `set_connexion_db` returns the appropriate database connection string based on the
  280        input format and connection type.
  281        :return: the value of the variable `connexion_db`.
  282        """
  283
  284        # Default connexion db
  285        default_connexion_db = ":memory:"
  286
  287        # Find connexion db
  288        if self.get_input_format() in ["db", "duckdb"]:
  289            connexion_db = self.get_input()
  290        elif self.get_connexion_type() in ["memory", default_connexion_db, None]:
  291            connexion_db = default_connexion_db
  292        elif self.get_connexion_type() in ["tmpfile"]:
  293            tmp_name = tempfile.mkdtemp(
  294                prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".db"
  295            )
  296            connexion_db = f"{tmp_name}/tmp.db"
  297        elif self.get_connexion_type() != "":
  298            connexion_db = self.get_connexion_type()
  299        else:
  300            connexion_db = default_connexion_db
  301
  302        # Set connexion db
  303        self.connexion_db = connexion_db
  304
  305        return connexion_db
  306
  307    def set_connexion(self, conn) -> None:
  308        """
  309        The function `set_connexion` creates a connection to a database, with options for different
  310        database formats and settings.
  311
  312        :param conn: The `conn` parameter in the `set_connexion` method is the connection to the
  313        database. If a connection is not provided, a new connection to an in-memory database is created.
  314        The method then proceeds to set up the connection based on the specified format (e.g., duckdb or
  315        sqlite
  316        """
  317
  318        # Connexion db
  319        connexion_db = self.set_connexion_db()
  320
  321        # Connexion config
  322        connexion_config = self.get_connexion_config()
  323
  324        # Connexion format
  325        connexion_format = self.get_config().get("connexion_format", "duckdb")
  326        # Set connexion format
  327        self.connexion_format = connexion_format
  328
  329        # Connexion
  330        if not conn:
  331            if connexion_format in ["duckdb"]:
  332                conn = duckdb.connect(connexion_db, config=connexion_config)
  333                # duckDB settings
  334                duckdb_settings = self.get_duckdb_settings()
  335                if duckdb_settings:
  336                    for setting in duckdb_settings:
  337                        setting_value = duckdb_settings.get(setting)
  338                        if isinstance(setting_value, str):
  339                            setting_value = f"'{setting_value}'"
  340                        conn.execute(f"PRAGMA {setting}={setting_value};")
  341            elif connexion_format in ["sqlite"]:
  342                conn = sqlite3.connect(connexion_db)
  343
  344        # Set connexion
  345        self.conn = conn
  346
  347        # Log
  348        log.debug(f"connexion_format: {connexion_format}")
  349        log.debug(f"connexion_db: {connexion_db}")
  350        log.debug(f"connexion config: {connexion_config}")
  351        log.debug(f"connexion duckdb settings: {self.get_duckdb_settings()}")
  352
  353    def set_output(self, output: str = None) -> None:
  354        """
  355        The `set_output` function in Python sets the output file based on the input or a specified key
  356        in the config file, extracting the output name, extension, and format.
  357
  358        :param output: The `output` parameter in the `set_output` method is used to specify the name of
  359        the output file. If the config file has an 'output' key, the method sets the output to the value
  360        of that key. If no output is provided, it sets the output to `None`
  361        :type output: str
  362        """
  363
  364        if output and not isinstance(output, str):
  365            self.output = output.name
  366        else:
  367            self.output = output
  368
  369        # Output format
  370        if self.output:
  371            output_name, output_extension = os.path.splitext(self.output)
  372            self.output_name = output_name
  373            self.output_extension = output_extension
  374            self.output_format = self.output_extension.replace(".", "")
  375        else:
  376            self.output_name = None
  377            self.output_extension = None
  378            self.output_format = None
  379
  380    def set_header(self) -> None:
  381        """
  382        It reads the header of a VCF file and stores it as a list of strings and as a VCF object
  383        """
  384
  385        input_file = self.get_input()
  386        default_header_list = [
  387            "##fileformat=VCFv4.2",
  388            "#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO",
  389        ]
  390
  391        # Full path
  392        input_file = full_path(input_file)
  393
  394        if input_file:
  395
  396            input_format = self.get_input_format()
  397            input_compressed = self.get_input_compressed()
  398            config = self.get_config()
  399            header_list = default_header_list
  400            if input_format in [
  401                "vcf",
  402                "hdr",
  403                "tsv",
  404                "csv",
  405                "psv",
  406                "parquet",
  407                "db",
  408                "duckdb",
  409            ]:
  410                # header provided in param
  411                if config.get("header_file", None):
  412                    with open(config.get("header_file"), "rt") as f:
  413                        header_list = self.read_vcf_header(f)
  414                # within a vcf file format (header within input file itsself)
  415                elif input_format in ["vcf", "hdr"] and not os.path.isdir(input_file):
  416                    # within a compressed vcf file format (.vcf.gz)
  417                    if input_compressed:
  418                        with bgzf.open(input_file, "rt") as f:
  419                            header_list = self.read_vcf_header(f)
  420                    # within an uncompressed vcf file format (.vcf)
  421                    else:
  422                        with open(input_file, "rt") as f:
  423                            header_list = self.read_vcf_header(f)
  424                # header provided in default external file .hdr
  425                elif os.path.exists((input_file + ".hdr")):
  426                    with open(input_file + ".hdr", "rt") as f:
  427                        header_list = self.read_vcf_header(f)
  428                else:
  429                    try:  # Try to get header info fields and file columns
  430
  431                        with tempfile.TemporaryDirectory() as tmpdir:
  432
  433                            # Create database
  434                            db_for_header = Database(database=input_file)
  435
  436                            # Get header columns for infos fields
  437                            db_header_from_columns = (
  438                                db_for_header.get_header_from_columns()
  439                            )
  440
  441                            # Get real columns in the file
  442                            db_header_columns = db_for_header.get_columns()
  443
  444                            # Write header file
  445                            header_file_tmp = os.path.join(tmpdir, "header")
  446                            f = open(header_file_tmp, "w")
  447                            vcf.Writer(f, db_header_from_columns)
  448                            f.close()
  449
  450                            # Replace #CHROM line with rel columns
  451                            header_list = db_for_header.read_header_file(
  452                                header_file=header_file_tmp
  453                            )
  454                            header_list[-1] = "\t".join(db_header_columns)
  455
  456                    except:
  457
  458                        log.warning(
  459                            f"No header for file {input_file}. Set as default VCF header"
  460                        )
  461                        header_list = default_header_list
  462
  463            else:  # try for unknown format ?
  464
  465                log.error(f"Input file format '{input_format}' not available")
  466                raise ValueError(f"Input file format '{input_format}' not available")
  467
  468            if not header_list:
  469                header_list = default_header_list
  470
  471            # header as list
  472            self.header_list = header_list
  473
  474            # header as VCF object
  475            self.header_vcf = vcf.Reader(io.StringIO("\n".join(header_list)))
  476
  477        else:
  478
  479            self.header_list = None
  480            self.header_vcf = None
  481
  482    def get_query_to_df(self, query: str = "", limit: int = None) -> pd.DataFrame:
  483        """
  484        The `get_query_to_df` function takes a query as a string and returns the result as a pandas
  485        DataFrame based on the connection format.
  486
  487        :param query: The `query` parameter in the `get_query_to_df` function is a string that
  488        represents the SQL query you want to execute. This query will be used to fetch data from a
  489        database and convert it into a pandas DataFrame
  490        :type query: str
  491        :param limit: The `limit` parameter in the `get_query_to_df` function is used to specify the
  492        maximum number of rows to be returned in the resulting dataframe. If a limit is provided, the
  493        function will only fetch up to that number of rows from the database query result. If no limit
  494        is specified,
  495        :type limit: int
  496        :return: A pandas DataFrame is being returned by the `get_query_to_df` function.
  497        """
  498
  499        # Connexion format
  500        connexion_format = self.get_connexion_format()
  501
  502        # Limit in query
  503        if limit:
  504            pd.set_option("display.max_rows", limit)
  505            if connexion_format in ["duckdb"]:
  506                df = (
  507                    self.conn.execute(query)
  508                    .fetch_record_batch(limit)
  509                    .read_next_batch()
  510                    .to_pandas()
  511                )
  512            elif connexion_format in ["sqlite"]:
  513                df = next(pd.read_sql_query(query, self.conn, chunksize=limit))
  514
  515        # Full query
  516        else:
  517            if connexion_format in ["duckdb"]:
  518                df = self.conn.execute(query).df()
  519            elif connexion_format in ["sqlite"]:
  520                df = pd.read_sql_query(query, self.conn)
  521
  522        return df
  523
  524    def get_overview(self) -> None:
  525        """
  526        The function prints the input, output, config, and dataframe of the current object
  527        """
  528        table_variants_from = self.get_table_variants(clause="from")
  529        sql_columns = self.get_header_columns_as_sql()
  530        sql_query_export = f"SELECT {sql_columns} FROM {table_variants_from}"
  531        df = self.get_query_to_df(sql_query_export)
  532        log.info(
  533            "Input:  "
  534            + str(self.get_input())
  535            + " ["
  536            + str(str(self.get_input_format()))
  537            + "]"
  538        )
  539        log.info(
  540            "Output: "
  541            + str(self.get_output())
  542            + " ["
  543            + str(str(self.get_output_format()))
  544            + "]"
  545        )
  546        log.info("Config: ")
  547        for d in str(json.dumps(self.get_config(), indent=4, sort_keys=True)).split(
  548            "\n"
  549        ):
  550            log.info("\t" + str(d))
  551        log.info("Param: ")
  552        for d in str(json.dumps(self.get_param(), indent=4, sort_keys=True)).split(
  553            "\n"
  554        ):
  555            log.info("\t" + str(d))
  556        log.info("Sample list: " + str(self.get_header_sample_list()))
  557        log.info("Dataframe: ")
  558        for d in str(df).split("\n"):
  559            log.info("\t" + str(d))
  560
  561        # garbage collector
  562        del df
  563        gc.collect()
  564
  565        return None
  566
    def get_stats(self) -> dict:
        """
        Compute statistics on the current variants table.

        Collected statistics: input file info, variant counts per
        chromosome, per-sample genotype counts (when GT/FORMAT are present),
        INFO/FORMAT header field listings, QUAL distribution (when a QUAL
        column exists), and SNV/InDel/MNV counts with SNV substitutions.

        NOTE(review): the SQL below uses DuckDB-dialect functions
        (REGEXP_EXTRACT, regexp_matches, len, string_split, median) —
        presumably requires a duckdb connection; verify before running
        against sqlite.

        :return: a dictionary of statistics with sections "Infos",
            "Variants", optionally "Samples", "Header", and optionally
            "Quality"
        """

        # Log
        log.info(f"Stats Calculation...")

        # Table variants
        table_variants_from = self.get_table_variants()

        # stats dict
        stats = {"Infos": {}}

        ### File
        input_file = self.get_input()
        stats["Infos"]["Input file"] = input_file

        # Header: INFO and FORMAT field descriptors from the VCF header
        header_infos = self.get_header().infos
        header_formats = self.get_header().formats
        header_infos_list = list(header_infos)
        header_formats_list = list(header_formats)

        ### Variants

        stats["Variants"] = {}

        # Variants by chr
        sql_query_nb_variant_by_chrom = f'SELECT "#CHROM" as CHROM, count(*) as count FROM {table_variants_from} GROUP BY "#CHROM"'
        df_nb_of_variants_by_chrom = self.get_query_to_df(sql_query_nb_variant_by_chrom)
        nb_of_variants_by_chrom = df_nb_of_variants_by_chrom.sort_values(
            by=["CHROM"], kind="quicksort"
        )

        # Total number of variants
        nb_of_variants = nb_of_variants_by_chrom["count"].sum()

        # Calculate percentage (fraction of total, per chromosome)
        nb_of_variants_by_chrom["percent"] = nb_of_variants_by_chrom["count"].apply(
            lambda x: (x / nb_of_variants)
        )

        stats["Variants"]["Number of variants by chromosome"] = (
            nb_of_variants_by_chrom.to_dict(orient="index")
        )

        stats["Infos"]["Number of variants"] = int(nb_of_variants)

        ### Samples

        # Init
        samples = {}
        nb_of_samples = 0

        # Check Samples: only when genotypes are declared (GT in FORMAT)
        # and a FORMAT column exists in the file
        if "GT" in header_formats_list and "FORMAT" in self.get_header_columns():
            log.debug(f"Check samples...")
            for sample in self.get_header_sample_list():
                # Count genotypes per sample; a row is counted only when the
                # sample value looks like a genotype call and has as many
                # ':'-separated fields as the FORMAT column
                sql_query_samples = f"""
                    SELECT  '{sample}' as sample,
                            REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1) as genotype,
                            count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1)) as count,
                            concat((count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1))/{nb_of_variants})) as percentage
                    FROM {table_variants_from}
                    WHERE (
                        regexp_matches("{sample}", '^[0-9]([/|][0-9])+')
                        AND
                        len(string_split(CAST("FORMAT" AS VARCHAR), ':')) = len(string_split(CAST("{sample}" AS VARCHAR), ':'))
                      )
                    GROUP BY genotype
                    """
                sql_query_genotype_df = self.conn.execute(sql_query_samples).df()
                sample_genotype_count = sql_query_genotype_df["count"].sum()
                # A sample is counted only when at least one genotype matched
                if len(sql_query_genotype_df):
                    nb_of_samples += 1
                    samples[f"{sample} - {sample_genotype_count} variants"] = (
                        sql_query_genotype_df.to_dict(orient="index")
                    )

            stats["Samples"] = samples
            stats["Infos"]["Number of samples"] = nb_of_samples

        # #
        # if "FORMAT" in self.get_header_columns() and "DP" in header_formats_list:
        #     stats["Infos"]["Number of samples"] = nb_of_samples
        # elif nb_of_samples:
        #     stats["Infos"]["Number of samples"] = "not a VCF format"

        ### INFO and FORMAT fields
        header_types_df = {}
        header_types_list = {
            "List of INFO fields": header_infos,
            "List of FORMAT fields": header_formats,
        }
        # i is a running row index shared across both field listings
        i = 0
        for header_type in header_types_list:

            header_type_infos = header_types_list.get(header_type)
            header_infos_dict = {}

            for info in header_type_infos:

                i += 1
                header_infos_dict[i] = {}

                # ID
                header_infos_dict[i]["id"] = info

                # num: special PyVCF codes map to VCF Number letters
                # (None -> '.', -1 -> 'A', -2 -> 'G', -3 -> 'R')
                genotype_map = {None: ".", -1: "A", -2: "G", -3: "R"}
                if header_type_infos[info].num in genotype_map.keys():
                    header_infos_dict[i]["Number"] = genotype_map.get(
                        header_type_infos[info].num
                    )
                else:
                    header_infos_dict[i]["Number"] = header_type_infos[info].num

                # type
                if header_type_infos[info].type:
                    header_infos_dict[i]["Type"] = header_type_infos[info].type
                else:
                    header_infos_dict[i]["Type"] = "."

                # desc
                if header_type_infos[info].desc != None:
                    header_infos_dict[i]["Description"] = header_type_infos[info].desc
                else:
                    header_infos_dict[i]["Description"] = ""

            if len(header_infos_dict):
                header_types_df[header_type] = pd.DataFrame.from_dict(
                    header_infos_dict, orient="index"
                ).to_dict(orient="index")

        # Stats
        stats["Infos"]["Number of INFO fields"] = len(header_infos_list)
        stats["Infos"]["Number of FORMAT fields"] = len(header_formats_list)
        stats["Header"] = header_types_df

        ### QUAL: basic distribution stats, skipping missing ('.') values
        if "QUAL" in self.get_header_columns():
            sql_query_qual = f"""
                    SELECT
                        avg(CAST(QUAL AS INTEGER)) AS Average,
                        min(CAST(QUAL AS INTEGER)) AS Minimum,
                        max(CAST(QUAL AS INTEGER)) AS Maximum,
                        stddev(CAST(QUAL AS INTEGER)) AS StandardDeviation,
                        median(CAST(QUAL AS INTEGER)) AS Median,
                        variance(CAST(QUAL AS INTEGER)) AS Variance
                    FROM {table_variants_from}
                    WHERE CAST(QUAL AS VARCHAR) NOT IN ('.')
                    """

            qual = self.conn.execute(sql_query_qual).df().to_dict(orient="index")
            stats["Quality"] = {"Stats": qual}

        ### SNV and InDel
        # Classify variants by REF/ALT lengths: Total, MNV, InDel, SNV
        sql_query_snv = f"""
            
            SELECT Type, count FROM (

                    SELECT
                        'Total' AS Type,
                        count(*) AS count
                    FROM {table_variants_from}

                    UNION

                    SELECT
                        'MNV' AS Type,
                        count(*) AS count
                    FROM {table_variants_from}
                    WHERE len(REF) > 1 AND len(ALT) > 1
                    AND len(REF) = len(ALT)

                    UNION

                    SELECT
                        'InDel' AS Type,
                        count(*) AS count
                    FROM {table_variants_from}
                    WHERE len(REF) > 1 OR len(ALT) > 1
                    AND len(REF) != len(ALT)
                    
                    UNION

                    SELECT
                        'SNV' AS Type,
                        count(*) AS count
                    FROM {table_variants_from}
                    WHERE len(REF) = 1 AND len(ALT) = 1

                )

            ORDER BY count DESC

                """
        snv_indel = self.conn.execute(sql_query_snv).df().to_dict(orient="index")

        # SNV substitution types (e.g. A>G), most frequent first
        sql_query_snv_substitution = f"""
                SELECT
                    concat(REF, '>', ALT) AS 'Substitution',
                    count(*) AS count
                FROM {table_variants_from}
                WHERE len(REF) = 1 AND len(ALT) = 1
                GROUP BY REF, ALT
                ORDER BY count(*) DESC
                """
        snv_substitution = (
            self.conn.execute(sql_query_snv_substitution).df().to_dict(orient="index")
        )
        stats["Variants"]["Counts"] = snv_indel
        stats["Variants"]["Substitutions"] = snv_substitution

        return stats
  788
  789    def stats_to_file(self, file: str = None) -> str:
  790        """
  791        The function `stats_to_file` takes a file name as input, retrieves statistics, serializes them
  792        into a JSON object, and writes the JSON object to the specified file.
  793
  794        :param file: The `file` parameter is a string that represents the file path where the JSON data
  795        will be written
  796        :type file: str
  797        :return: the name of the file that was written to.
  798        """
  799
  800        # Get stats
  801        stats = self.get_stats()
  802
  803        # Serializing json
  804        json_object = json.dumps(stats, indent=4)
  805
  806        # Writing to sample.json
  807        with open(file, "w") as outfile:
  808            outfile.write(json_object)
  809
  810        return file
  811
  812    def print_stats(self, output_file: str = None, json_file: str = None) -> None:
  813        """
  814        The `print_stats` function generates a markdown file and prints the statistics contained in a
  815        JSON file in a formatted manner.
  816
  817        :param output_file: The `output_file` parameter is a string that specifies the path and filename
  818        of the output file where the stats will be printed in Markdown format. If no `output_file` is
  819        provided, a temporary directory will be created and the stats will be saved in a file named
  820        "stats.md" within that
  821        :type output_file: str
  822        :param json_file: The `json_file` parameter is a string that represents the path to the JSON
  823        file where the statistics will be saved. If no value is provided, a temporary directory will be
  824        created and a default file name "stats.json" will be used
  825        :type json_file: str
  826        :return: The function `print_stats` does not return any value. It has a return type annotation
  827        of `None`.
  828        """
  829
  830        # Full path
  831        output_file = full_path(output_file)
  832        json_file = full_path(json_file)
  833
  834        with tempfile.TemporaryDirectory() as tmpdir:
  835
  836            # Files
  837            if not output_file:
  838                output_file = os.path.join(tmpdir, "stats.md")
  839            if not json_file:
  840                json_file = os.path.join(tmpdir, "stats.json")
  841
  842            # Create folders
  843            if not os.path.exists(os.path.dirname(output_file)):
  844                Path(os.path.dirname(output_file)).mkdir(parents=True, exist_ok=True)
  845            if not os.path.exists(os.path.dirname(json_file)):
  846                Path(os.path.dirname(json_file)).mkdir(parents=True, exist_ok=True)
  847
  848            # Create stats JSON file
  849            stats_file = self.stats_to_file(file=json_file)
  850
  851            # Print stats file
  852            with open(stats_file) as f:
  853                stats = yaml.safe_load(f)
  854
  855            # Output
  856            output_title = []
  857            output_index = []
  858            output = []
  859
  860            # Title
  861            output_title.append("# HOWARD Stats")
  862
  863            # Index
  864            output_index.append("## Index")
  865
  866            # Process sections
  867            for section in stats:
  868                infos = stats.get(section)
  869                section_link = "#" + section.lower().replace(" ", "-")
  870                output.append(f"## {section}")
  871                output_index.append(f"- [{section}]({section_link})")
  872
  873                if len(infos):
  874                    for info in infos:
  875                        try:
  876                            df = pd.DataFrame.from_dict(infos.get(info), orient="index")
  877                            is_df = True
  878                        except:
  879                            try:
  880                                df = pd.DataFrame.from_dict(
  881                                    json.loads((infos.get(info))), orient="index"
  882                                )
  883                                is_df = True
  884                            except:
  885                                is_df = False
  886                        if is_df:
  887                            output.append(f"### {info}")
  888                            info_link = "#" + info.lower().replace(" ", "-")
  889                            output_index.append(f"   - [{info}]({info_link})")
  890                            output.append(f"{df.to_markdown(index=False)}")
  891                        else:
  892                            output.append(f"- {info}: {infos.get(info)}")
  893                else:
  894                    output.append(f"NA")
  895
  896            # Write stats in markdown file
  897            with open(output_file, "w") as fp:
  898                for item in output_title:
  899                    fp.write("%s\n" % item)
  900                for item in output_index:
  901                    fp.write("%s\n" % item)
  902                for item in output:
  903                    fp.write("%s\n" % item)
  904
  905            # Output stats in markdown
  906            print("")
  907            print("\n\n".join(output_title))
  908            print("")
  909            print("\n\n".join(output))
  910            print("")
  911
  912        return None
  913
    def get_input(self) -> str:
        """
        Return the current input file path.

        :return: The input file path set on this object.
        """
        return self.input
  920
  921    def get_input_format(self, input_file: str = None) -> str:
  922        """
  923        This function returns the format of the input variable, either from the provided input file or
  924        by prompting for input.
  925
  926        :param input_file: The `input_file` parameter in the `get_input_format` method is a string that
  927        represents the file path of the input file. If no `input_file` is provided when calling the
  928        method, it will default to `None`
  929        :type input_file: str
  930        :return: The format of the input variable is being returned.
  931        """
  932
  933        if not input_file:
  934            input_file = self.get_input()
  935        input_format = get_file_format(input_file)
  936        return input_format
  937
  938    def get_input_compressed(self, input_file: str = None) -> str:
  939        """
  940        The function `get_input_compressed` returns the format of the input variable after compressing
  941        it.
  942
  943        :param input_file: The `input_file` parameter in the `get_input_compressed` method is a string
  944        that represents the file path of the input file. If no `input_file` is provided when calling the
  945        method, it will default to `None` and the method will then call `self.get_input()` to
  946        :type input_file: str
  947        :return: The function `get_input_compressed` returns the compressed format of the input
  948        variable.
  949        """
  950
  951        if not input_file:
  952            input_file = self.get_input()
  953        input_compressed = get_file_compressed(input_file)
  954        return input_compressed
  955
    def get_output(self) -> str:
        """
        Return the current output file path.

        :return: The output file path set on this object.
        """

        return self.output
  963
  964    def get_output_format(self, output_file: str = None) -> str:
  965        """
  966        The function `get_output_format` returns the format of the input variable or the output file if
  967        provided.
  968
  969        :param output_file: The `output_file` parameter in the `get_output_format` method is a string
  970        that represents the file path of the output file. If no `output_file` is provided when calling
  971        the method, it will default to the output obtained from the `get_output` method of the class
  972        instance. The
  973        :type output_file: str
  974        :return: The format of the input variable is being returned.
  975        """
  976
  977        if not output_file:
  978            output_file = self.get_output()
  979        output_format = get_file_format(output_file)
  980
  981        return output_format
  982
    def get_config(self) -> dict:
        """
        Return the configuration dictionary.

        :return: The config dictionary set on this object.
        """
        return self.config
  989
    def get_param(self) -> dict:
        """
        Return the parameters dictionary.

        :return: The param dictionary set on this object.
        """
        return self.param
  996
    def get_connexion_db(self) -> str:
        """
        Return the database connexion string/path.

        :return: The connexion_db attribute of this object.
        """
        return self.connexion_db
 1003
    def get_prefix(self) -> str:
        """
        Return the prefix used by this object.

        :return: The prefix attribute of this object.
        """
        return self.prefix
 1010
 1011    def get_table_variants(self, clause: str = "select") -> str:
 1012        """
 1013        This function returns the table_variants attribute of the object
 1014
 1015        :param clause: the type of clause the table will be used. Either "select" or "from" (optional),
 1016        defaults to select (optional)
 1017        :return: The table_variants attribute of the object.
 1018        """
 1019
 1020        # Access
 1021        access = self.get_config().get("access", None)
 1022
 1023        # Clauses "select", "where", "update"
 1024        if clause in ["select", "where", "update"]:
 1025            table_variants = self.table_variants
 1026        # Clause "from"
 1027        elif clause in ["from"]:
 1028            # For Read Only
 1029            if self.get_input_format() in ["parquet"] and access in ["RO"]:
 1030                input_file = self.get_input()
 1031                table_variants = f"'{input_file}' as variants"
 1032            # For Read Write
 1033            else:
 1034                table_variants = f"{self.table_variants} as variants"
 1035        else:
 1036            table_variants = self.table_variants
 1037        return table_variants
 1038
 1039    def get_tmp_dir(self) -> str:
 1040        """
 1041        The function `get_tmp_dir` returns the temporary directory path based on configuration
 1042        parameters or a default path.
 1043        :return: The `get_tmp_dir` method is returning the temporary directory path based on the
 1044        configuration, parameters, and a default value of "/tmp".
 1045        """
 1046
 1047        return get_tmp(
 1048            config=self.get_config(), param=self.get_param(), default_tmp="/tmp"
 1049        )
 1050
    def get_connexion_type(self) -> str:
        """
        Return the connexion type from the configuration.

        :return: The "connexion_type" config value, defaulting to "memory".
        """
        return self.get_config().get("connexion_type", "memory")
 1058
    def get_connexion(self):
        """
        Return the database connection object.

        :return: The connection object held by this instance.
        """
        return self.conn
 1066
    def close_connexion(self) -> None:
        """
        Close the connection to the database.

        :return: The result of the connection's close() call.
        """
        return self.conn.close()
 1073
 1074    def get_header(self, type: str = "vcf"):
 1075        """
 1076        This function returns the header of the VCF file as a list of strings
 1077
 1078        :param type: the type of header you want to get, defaults to vcf (optional)
 1079        :return: The header of the vcf file.
 1080        """
 1081
 1082        if self.header_vcf:
 1083            if type == "vcf":
 1084                return self.header_vcf
 1085            elif type == "list":
 1086                return self.header_list
 1087        else:
 1088            if type == "vcf":
 1089                header = vcf.Reader(io.StringIO("\n".join(vcf_required)))
 1090                return header
 1091            elif type == "list":
 1092                return vcf_required
 1093
 1094    def get_header_length(self, file: str = None) -> int:
 1095        """
 1096        The function `get_header_length` returns the length of the header list, excluding the #CHROM
 1097        line.
 1098
 1099        :param file: The `file` parameter is an optional argument that specifies the path to a VCF
 1100        header file. If this argument is provided, the function will read the header from the specified
 1101        file and return the length of the header list minus 1 (to exclude the #CHROM line)
 1102        :type file: str
 1103        :return: the length of the header list, excluding the #CHROM line.
 1104        """
 1105
 1106        if file:
 1107            return len(self.read_vcf_header_file(file=file)) - 1
 1108        elif self.get_header(type="list"):
 1109            return len(self.get_header(type="list")) - 1
 1110        else:
 1111            return 0
 1112
 1113    def get_header_columns(self) -> str:
 1114        """
 1115        This function returns the header list of a VCF
 1116
 1117        :return: The length of the header list.
 1118        """
 1119        if self.get_header():
 1120            return self.get_header(type="list")[-1]
 1121        else:
 1122            return ""
 1123
 1124    def get_header_columns_as_list(self) -> list:
 1125        """
 1126        This function returns the header list of a VCF
 1127
 1128        :return: The length of the header list.
 1129        """
 1130        if self.get_header():
 1131            return self.get_header_columns().strip().split("\t")
 1132        else:
 1133            return []
 1134
 1135    def get_header_columns_as_sql(self) -> str:
 1136        """
 1137        This function retruns header length (without #CHROM line)
 1138
 1139        :return: The length of the header list.
 1140        """
 1141        sql_column_list = []
 1142        for col in self.get_header_columns_as_list():
 1143            sql_column_list.append(f'"{col}"')
 1144        return ",".join(sql_column_list)
 1145
 1146    def get_header_sample_list(
 1147        self, check: bool = False, samples: list = None, samples_force: bool = False
 1148    ) -> list:
 1149        """
 1150        The function `get_header_sample_list` returns a list of samples from a VCF header, with optional
 1151        checking and filtering based on input parameters.
 1152
 1153        :param check: The `check` parameter in the `get_header_sample_list` function is a boolean
 1154        parameter that determines whether to check if the samples in the list are properly defined as
 1155        genotype columns. If `check` is set to `True`, the function will verify if each sample in the
 1156        list is defined as a, defaults to False
 1157        :type check: bool (optional)
 1158        :param samples: The `samples` parameter in the `get_header_sample_list` function is a list that
 1159        allows you to specify a subset of samples from the header. If you provide a list of sample
 1160        names, the function will check if each sample is defined in the header. If a sample is not found
 1161        in the
 1162        :type samples: list
 1163        :param samples_force: The `samples_force` parameter in the `get_header_sample_list` function is
 1164        a boolean parameter that determines whether to force the function to return the sample list
 1165        without checking if the samples are genotype columns. If `samples_force` is set to `True`, the
 1166        function will return the sample list without performing, defaults to False
 1167        :type samples_force: bool (optional)
 1168        :return: The function `get_header_sample_list` returns a list of samples based on the input
 1169        parameters and conditions specified in the function.
 1170        """
 1171
 1172        # Init
 1173        samples_list = []
 1174
 1175        if samples is None:
 1176            samples_list = self.header_vcf.samples
 1177        else:
 1178            samples_checked = []
 1179            for sample in samples:
 1180                if sample in self.header_vcf.samples:
 1181                    samples_checked.append(sample)
 1182                else:
 1183                    log.warning(f"Sample '{sample}' not defined in header")
 1184            samples_list = samples_checked
 1185
 1186            # Force sample list without checking if is_genotype_column
 1187            if samples_force:
 1188                log.warning(f"Samples {samples_list} not checked if genotypes")
 1189                return samples_list
 1190
 1191        if check:
 1192            samples_checked = []
 1193            for sample in samples_list:
 1194                if self.is_genotype_column(column=sample):
 1195                    samples_checked.append(sample)
 1196                else:
 1197                    log.warning(
 1198                        f"Sample '{sample}' not defined as a sample (genotype not well defined)"
 1199                    )
 1200            samples_list = samples_checked
 1201
 1202        # Return samples list
 1203        return samples_list
 1204
 1205    def is_genotype_column(self, column: str = None) -> bool:
 1206        """
 1207        This function checks if a given column is a genotype column in a database.
 1208
 1209        :param column: The `column` parameter in the `is_genotype_column` method is a string that
 1210        represents the column name in a database table. This method checks if the specified column is a
 1211        genotype column in the database. If a column name is provided, it calls the `is_genotype_column`
 1212        method of
 1213        :type column: str
 1214        :return: The `is_genotype_column` method is returning a boolean value. If the `column` parameter
 1215        is not None, it calls the `is_genotype_column` method of the `Database` class with the specified
 1216        column name and returns the result. If the `column` parameter is None, it returns False.
 1217        """
 1218
 1219        if column is not None:
 1220            return Database(database=self.get_input()).is_genotype_column(column=column)
 1221        else:
 1222            return False
 1223
 1224    def get_verbose(self) -> bool:
 1225        """
 1226        It returns the value of the "verbose" key in the config dictionary, or False if the key doesn't
 1227        exist
 1228
 1229        :return: The value of the key "verbose" in the config dictionary.
 1230        """
 1231        return self.get_config().get("verbose", False)
 1232
 1233    def get_connexion_format(self) -> str:
 1234        """
 1235        It returns the connexion format of the object.
 1236        :return: The connexion_format is being returned.
 1237        """
 1238        connexion_format = self.connexion_format
 1239        if connexion_format not in ["duckdb", "sqlite"]:
 1240            log.error(f"Unknown connexion format {connexion_format}")
 1241            raise ValueError(f"Unknown connexion format {connexion_format}")
 1242        else:
 1243            return connexion_format
 1244
    def insert_file_to_table(
        self,
        file,
        columns: str,
        header_len: int = 0,
        sep: str = "\t",
        chunksize: int = 1000000,
    ) -> None:
        """
        Read a delimited file in chunks with pandas and insert each chunk into
        the "variants" table of the connected database (duckdb or sqlite).

        :param file: Path (or file-like object) of the file to load
        :param columns: Comma-separated list of column names to insert
        :type columns: str
        :param header_len: Number of leading lines to skip before the data,
        defaults to 0
        :type header_len: int (optional)
        :param sep: Field separator used in the file, defaults to tab
        :type sep: str (optional)
        :param chunksize: Number of rows read per chunk; may be overridden by
        the "load.chunk" config entry, defaults to 1000000
        :type chunksize: int (optional)
        """

        # Config: the "load.chunk" entry overrides the chunksize argument
        chunksize = self.get_config().get("load", {}).get("chunk", chunksize)
        connexion_format = self.get_connexion_format()

        log.debug("chunksize: " + str(chunksize))

        # NOTE(review): when chunksize resolves to a falsy value the file is
        # silently not loaded — confirm this is intended
        if chunksize:
            for chunk in pd.read_csv(
                file, skiprows=header_len, sep=sep, chunksize=chunksize, engine="c"
            ):
                if connexion_format in ["duckdb"]:
                    # duckdb resolves "chunk" in the SQL through its pandas
                    # replacement scan on the local variable of that name;
                    # renaming the loop variable would break this INSERT
                    sql_insert_into = (
                        f"INSERT INTO variants ({columns}) SELECT {columns} FROM chunk"
                    )
                    self.conn.execute(sql_insert_into)
                elif connexion_format in ["sqlite"]:
                    # sqlite path: append the DataFrame chunk directly
                    chunk.to_sql("variants", self.conn, if_exists="append", index=False)
 1298
    def load_data(
        self,
        input_file: str = None,
        drop_variants_table: bool = False,
        sample_size: int = 20480,
    ) -> None:
        """
        Load the input file into the variants table.

        With a duckdb connexion, any format supported by `Database.get_sql_from`
        is attached as a view (read-only access) or copied into a table. With a
        sqlite connexion, only vcf/tsv/csv/psv inputs are supported and are
        inserted chunk-by-chunk.

        :param input_file: The path to the input file. This is the VCF file that will be loaded into the
        table
        :type input_file: str
        :param drop_variants_table: The `drop_variants_table` parameter is a boolean flag that
        determines whether the variants table should be dropped before loading the data. If set to
        `True`, the variants table will be dropped. If set to `False` (default), the variants table will
        not be dropped, defaults to False
        :type drop_variants_table: bool (optional)
        :param sample_size: The `sample_size` parameter determines the number of rows to be sampled from
        the input file. If it is set to `None`, the default value of 20480 will be used, defaults to
        20480
        :type sample_size: int (optional)
        """

        log.info("Loading...")

        # change input file (and re-read its header)
        if input_file:
            self.set_input(input_file)
            self.set_header()

        # drop variants table
        if drop_variants_table:
            self.drop_variants_table()

        # get table variants
        table_variants = self.get_table_variants()

        # Access
        access = self.get_config().get("access", None)
        log.debug(f"access: {access}")

        # Input format and compress
        input_format = self.get_input_format()
        input_compressed = self.get_input_compressed()
        log.debug(f"input_format: {input_format}")
        log.debug(f"input_compressed: {input_compressed}")

        # input_compressed_format
        if input_compressed:
            input_compressed_format = "gzip"
        else:
            input_compressed_format = "none"
        log.debug(f"input_compressed_format: {input_compressed_format}")

        # Connexion format
        connexion_format = self.get_connexion_format()

        # Sample size
        # presumably -1 disables sampling downstream — confirm in
        # Database.get_sql_from
        if not sample_size:
            sample_size = -1
        log.debug(f"sample_size: {sample_size}")

        # Load data
        log.debug(f"Load Data from {input_format}")

        # DuckDB connexion
        if connexion_format in ["duckdb"]:

            # Database already exists
            if self.input_format in ["db", "duckdb"]:

                if connexion_format in ["duckdb"]:
                    log.debug(f"Input file format '{self.input_format}' duckDB")
                else:
                    # NOTE(review): unreachable — the outer branch already
                    # guarantees connexion_format is "duckdb"
                    log.error(
                        f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'"
                    )
                    raise ValueError(
                        f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'"
                    )

            # Load from existing database format
            else:

                # NOTE(review): bare "except:" below hides the original
                # failure; consider "except Exception as e: ... from e"
                try:
                    # Create Table or View
                    database = Database(database=self.input)
                    sql_from = database.get_sql_from(sample_size=sample_size)

                    # Read-only access keeps the data in place as a view;
                    # otherwise the data is materialized into a table
                    if access in ["RO"]:
                        sql_load = (
                            f"CREATE VIEW {table_variants} AS SELECT * FROM {sql_from}"
                        )
                    else:
                        sql_load = (
                            f"CREATE TABLE {table_variants} AS SELECT * FROM {sql_from}"
                        )
                    self.conn.execute(sql_load)

                except:
                    # Format not available
                    log.error(f"Input file format '{self.input_format}' not available")
                    raise ValueError(
                        f"Input file format '{self.input_format}' not available"
                    )

        # SQLite connexion
        elif connexion_format in ["sqlite"] and input_format in [
            "vcf",
            "tsv",
            "csv",
            "psv",
        ]:

            # Main structure: fixed VCF columns and their SQL types
            structure = {
                "#CHROM": "VARCHAR",
                "POS": "INTEGER",
                "ID": "VARCHAR",
                "REF": "VARCHAR",
                "ALT": "VARCHAR",
                "QUAL": "VARCHAR",
                "FILTER": "VARCHAR",
                "INFO": "VARCHAR",
            }

            # Strcuture with samples
            # NOTE(review): this aliases "structure" (no copy) — both names
            # refer to the same dict, so FORMAT and samples are added to one
            # shared structure
            structure_complete = structure
            if self.get_header_sample_list():
                structure["FORMAT"] = "VARCHAR"
                for sample in self.get_header_sample_list():
                    structure_complete[sample] = "VARCHAR"

            # Columns list for create and insert
            sql_create_table_columns = []
            sql_create_table_columns_list = []
            for column in structure_complete:
                column_type = structure_complete[column]
                sql_create_table_columns.append(
                    f'"{column}" {column_type} default NULL'
                )
                sql_create_table_columns_list.append(f'"{column}"')

            # Create database
            log.debug(f"Create Table {table_variants}")
            sql_create_table_columns_sql = ", ".join(sql_create_table_columns)
            sql_create_table_columns_list_sql = ", ".join(sql_create_table_columns_list)
            sql_create_table = f"CREATE TABLE IF NOT EXISTS {table_variants} ({sql_create_table_columns_sql})"
            self.conn.execute(sql_create_table)

            # chunksize define length of file chunk load file
            chunksize = 100000

            # delimiter
            delimiter = file_format_delimiters.get(input_format, "\t")

            # Load the input file
            with open(self.input, "rt") as input_file:

                # Use the appropriate file handler based on the input format
                # (bgzf for compressed input); VCF input skips its header lines
                if input_compressed:
                    input_file = bgzf.open(self.input, "rt")
                if input_format in ["vcf"]:
                    header_len = self.get_header_length()
                else:
                    header_len = 0

                # Insert the file contents into a table
                self.insert_file_to_table(
                    input_file,
                    columns=sql_create_table_columns_list_sql,
                    header_len=header_len,
                    sep=delimiter,
                    chunksize=chunksize,
                )

        else:
            log.error(
                f"Connexion format '{connexion_format}' not available with format '{input_format}'"
            )
            raise ValueError(
                f"Connexion format '{connexion_format}' not available with format '{input_format}'"
            )

        # Explode INFOS fields into table fields
        if self.get_explode_infos():
            self.explode_infos(
                prefix=self.get_explode_infos_prefix(),
                fields=self.get_explode_infos_fields(),
                force=True,
            )

        # Create index after insertion
        self.create_indexes()
 1494
 1495    def get_explode_infos(self) -> bool:
 1496        """
 1497        The function `get_explode_infos` returns the value of the "explode_infos" parameter, defaulting
 1498        to False if it is not set.
 1499        :return: The method is returning the value of the "explode_infos" parameter, which is a boolean
 1500        value. If the parameter is not present, it will return False.
 1501        """
 1502
 1503        return self.get_param().get("explode", {}).get("explode_infos", False)
 1504
 1505    def get_explode_infos_fields(
 1506        self,
 1507        explode_infos_fields: str = None,
 1508        remove_fields_not_in_header: bool = False,
 1509    ) -> list:
 1510        """
 1511        The `get_explode_infos_fields` function returns a list of exploded information fields based on
 1512        the input parameter `explode_infos_fields`.
 1513
 1514        :param explode_infos_fields: The `explode_infos_fields` parameter is a string that specifies the
 1515        fields to be exploded. It can be set to "ALL" to explode all fields, or it can be a
 1516        comma-separated list of field names to explode
 1517        :type explode_infos_fields: str
 1518        :param remove_fields_not_in_header: The parameter `remove_fields_not_in_header` is a boolean
 1519        flag that determines whether to remove fields that are not present in the header. If it is set
 1520        to `True`, any field that is not in the header will be excluded from the list of exploded
 1521        information fields. If it is set to `, defaults to False
 1522        :type remove_fields_not_in_header: bool (optional)
 1523        :return: The function `get_explode_infos_fields` returns a list of exploded information fields.
 1524        If the `explode_infos_fields` parameter is not provided or is set to None, it returns an empty
 1525        list. If the parameter is provided and its value is "ALL", it also returns an empty list.
 1526        Otherwise, it returns a list of exploded information fields after removing any spaces and
 1527        splitting the string by commas.
 1528        """
 1529
 1530        # If no fields, get it in param
 1531        if not explode_infos_fields:
 1532            explode_infos_fields = (
 1533                self.get_param().get("explode", {}).get("explode_infos_fields", None)
 1534            )
 1535
 1536        # If no fields, defined as all fields in header using keyword
 1537        if not explode_infos_fields:
 1538            explode_infos_fields = "*"
 1539
 1540        # If fields list not empty
 1541        if explode_infos_fields:
 1542
 1543            # Input fields list
 1544            if isinstance(explode_infos_fields, str):
 1545                fields_input = explode_infos_fields.split(",")
 1546            elif isinstance(explode_infos_fields, list):
 1547                fields_input = explode_infos_fields
 1548            else:
 1549                fields_input = []
 1550
 1551            # Fields list without * keyword
 1552            fields_without_all = fields_input.copy()
 1553            if "*".casefold() in (item.casefold() for item in fields_without_all):
 1554                fields_without_all.remove("*")
 1555
 1556            # Fields in header
 1557            fields_in_header = sorted(list(set(self.get_header().infos)))
 1558
 1559            # Construct list of fields
 1560            fields_output = []
 1561            for field in fields_input:
 1562
 1563                # Strip field
 1564                field = field.strip()
 1565
 1566                # format keyword * in regex
 1567                if field.upper() in ["*"]:
 1568                    field = ".*"
 1569
 1570                # Find all fields with pattern
 1571                r = re.compile(field)
 1572                fields_search = sorted(list(filter(r.match, fields_in_header)))
 1573
 1574                # Remove fields input from search
 1575                if field in fields_search:
 1576                    fields_search = [field]
 1577                elif fields_search != [field]:
 1578                    fields_search = sorted(
 1579                        list(set(fields_search).difference(fields_input))
 1580                    )
 1581
 1582                # If field is not in header (avoid not well formatted header)
 1583                if not fields_search and not remove_fields_not_in_header:
 1584                    fields_search = [field]
 1585
 1586                # Add found fields
 1587                for new_field in fields_search:
 1588                    # Add field, if not already exists, and if it is in header (if asked)
 1589                    if (
 1590                        new_field not in fields_output
 1591                        and (
 1592                            not remove_fields_not_in_header
 1593                            or new_field in fields_in_header
 1594                        )
 1595                        and new_field not in [".*"]
 1596                    ):
 1597                        fields_output.append(new_field)
 1598
 1599            return fields_output
 1600
 1601        else:
 1602
 1603            return []
 1604
 1605    def get_explode_infos_prefix(self, explode_infos_prefix: str = None) -> str:
 1606        """
 1607        The function `get_explode_infos_prefix` returns the value of the `explode_infos_prefix` parameter, or
 1608        the value of `self.get_param().get("explode_infos_prefix", None)` if `explode_infos_prefix` is
 1609        not provided.
 1610
 1611        :param explode_infos_prefix: The parameter `explode_infos_prefix` is a string that specifies a
 1612        prefix to be used for exploding or expanding information
 1613        :type explode_infos_prefix: str
 1614        :return: the value of the variable `explode_infos_prefix`.
 1615        """
 1616
 1617        if not explode_infos_prefix:
 1618            explode_infos_prefix = (
 1619                self.get_param().get("explode", {}).get("explode_infos_prefix", "")
 1620            )
 1621
 1622        return explode_infos_prefix
 1623
 1624    def add_column(
 1625        self,
 1626        table_name,
 1627        column_name,
 1628        column_type,
 1629        default_value=None,
 1630        drop: bool = False,
 1631    ) -> dict:
 1632        """
 1633        The `add_column` function adds a column to a SQLite or DuckDB table with a default value if it
 1634        doesn't already exist.
 1635
 1636        :param table_name: The name of the table to which you want to add a column
 1637        :param column_name: The parameter "column_name" is the name of the column that you want to add
 1638        to the table
 1639        :param column_type: The `column_type` parameter specifies the data type of the column that you
 1640        want to add to the table. It should be a string that represents the desired data type, such as
 1641        "INTEGER", "TEXT", "REAL", etc
 1642        :param default_value: The `default_value` parameter is an optional parameter that specifies the
 1643        default value for the newly added column. If a default value is provided, it will be assigned to
 1644        the column for any existing rows that do not have a value for that column
 1645        :param drop: The `drop` parameter is a boolean flag that determines whether to drop the column
 1646        if it already exists in the table. If `drop` is set to `True`, the function will drop the
 1647        existing column before adding the new column. If `drop` is set to `False` (default),, defaults
 1648        to False
 1649        :type drop: bool (optional)
 1650        :return: a boolean value indicating whether the column was successfully added to the table.
 1651        """
 1652
 1653        # added
 1654        added = False
 1655        dropped = False
 1656
 1657        # Check if the column already exists in the table
 1658        query = f""" SELECT * FROM {table_name} LIMIT 0 """
 1659        columns = self.get_query_to_df(query).columns.tolist()
 1660        if column_name.upper() in [c.upper() for c in columns]:
 1661            log.debug(
 1662                f"The {column_name} column already exists in the {table_name} table"
 1663            )
 1664            if drop:
 1665                self.drop_column(table_name=table_name, column_name=column_name)
 1666                dropped = True
 1667            else:
 1668                return None
 1669        else:
 1670            log.debug(f"The {column_name} column NOT exists in the {table_name} table")
 1671
 1672        # Add column in table
 1673        add_column_query = (
 1674            f""" ALTER TABLE {table_name} ADD COLUMN "{column_name}" {column_type} """
 1675        )
 1676        if default_value is not None:
 1677            add_column_query += f" DEFAULT {default_value}"
 1678        self.execute_query(add_column_query)
 1679        added = not dropped
 1680        log.debug(
 1681            f"The {column_name} column was successfully added to the {table_name} table"
 1682        )
 1683
 1684        if added:
 1685            added_column = {
 1686                "table_name": table_name,
 1687                "column_name": column_name,
 1688                "column_type": column_type,
 1689                "default_value": default_value,
 1690            }
 1691        else:
 1692            added_column = None
 1693
 1694        return added_column
 1695
 1696    def drop_column(
 1697        self, column: dict = None, table_name: str = None, column_name: str = None
 1698    ) -> bool:
 1699        """
 1700        The `drop_column` function drops a specified column from a given table in a database and returns
 1701        True if the column was successfully dropped, and False if the column does not exist in the
 1702        table.
 1703
 1704        :param column: The `column` parameter is a dictionary that contains information about the column
 1705        you want to drop. It has two keys:
 1706        :type column: dict
 1707        :param table_name: The `table_name` parameter is the name of the table from which you want to
 1708        drop a column
 1709        :type table_name: str
 1710        :param column_name: The `column_name` parameter is the name of the column that you want to drop
 1711        from the table
 1712        :type column_name: str
 1713        :return: a boolean value. It returns True if the column was successfully dropped from the table,
 1714        and False if the column does not exist in the table.
 1715        """
 1716
 1717        # Find column infos
 1718        if column:
 1719            if isinstance(column, dict):
 1720                table_name = column.get("table_name", None)
 1721                column_name = column.get("column_name", None)
 1722            elif isinstance(column, str):
 1723                table_name = self.get_table_variants()
 1724                column_name = column
 1725            else:
 1726                table_name = None
 1727                column_name = None
 1728
 1729        if not table_name and not column_name:
 1730            return False
 1731
 1732        # Removed
 1733        removed = False
 1734
 1735        # Check if the column already exists in the table
 1736        query = f""" SELECT * FROM {table_name} LIMIT 0 """
 1737        columns = self.get_query_to_df(query).columns.tolist()
 1738        if column_name in columns:
 1739            log.debug(f"The {column_name} column exists in the {table_name} table")
 1740        else:
 1741            log.debug(f"The {column_name} column NOT exists in the {table_name} table")
 1742            return False
 1743
 1744        # Add column in table # ALTER TABLE integers DROP k
 1745        add_column_query = f""" ALTER TABLE {table_name} DROP "{column_name}" """
 1746        self.execute_query(add_column_query)
 1747        removed = True
 1748        log.debug(
 1749            f"The {column_name} column was successfully dropped to the {table_name} table"
 1750        )
 1751
 1752        return removed
 1753
    def explode_infos(
        self,
        prefix: str = None,
        create_index: bool = False,
        fields: list = None,
        force: bool = False,
        proccess_all_fields_together: bool = False,
        table: str = None,
    ) -> list:
        """
        The `explode_infos` function in Python takes a VCF file and explodes the INFO fields into
        individual columns, returning a list of added columns.

        :param prefix: The `prefix` parameter is a string that is used as a prefix for the exploded INFO
        fields. If the `prefix` is not provided or is set to `None`, the function will use the value of
        `self.get_explode_infos_prefix()` as the prefix
        :type prefix: str
        :param create_index: The `create_index` parameter is a boolean flag that specifies whether to
        create indexes on the exploded INFO fields. If set to `True`, indexes will be created; if set to
        `False`, indexes will not be created. The default value is `False`, defaults to False
        :type create_index: bool (optional)
        :param fields: The `fields` parameter in the `explode_infos` function is a list of INFO fields
        that you want to explode into individual columns. If this parameter is not provided, all INFO
        fields will be exploded. You can specify the INFO fields you want to explode by passing them as
        a list to the `
        :type fields: list
        :param force: The `force` parameter in the `explode_infos` function is a boolean flag that
        determines whether to drop and recreate a column if it already exists in the table. If `force`
        is set to `True`, the column will be dropped and recreated. If `force` is set to `False,
        defaults to False
        :type force: bool (optional)
        :param proccess_all_fields_together: The `proccess_all_fields_together` parameter is a boolean
        flag that determines whether to process all the INFO fields together or individually. If set to
        `True`, all the INFO fields will be processed together. If set to `False`, each INFO field will
        be processed individually. The default value is, defaults to False
        :type proccess_all_fields_together: bool (optional)
        :param table: The `table` parameter in the `explode_infos` function is used to specify the name
        of the table where the exploded INFO fields will be added as individual columns. If you provide
        a value for the `table` parameter, the function will use that table name. If the `table`
        parameter is
        :type table: str
        :return: The `explode_infos` function returns a list of added columns.
        """

        # drop indexes (recreated on demand at the end via create_index)
        self.drop_indexes()

        # connexion format (drives the SQL dialect used for extraction below)
        connexion_format = self.get_connexion_format()

        # Access mode ("RO" forbids schema changes)
        access = self.get_config().get("access", None)

        # Added columns
        added_columns = []

        if access not in ["RO"]:

            # prefix: fall back to the configured prefix, then to "INFO/"
            if prefix in [None, True] or not isinstance(prefix, str):
                if self.get_explode_infos_prefix() not in [None, True]:
                    prefix = self.get_explode_infos_prefix()
                else:
                    prefix = "INFO/"

            # table variants
            if table is not None:
                table_variants = table
            else:
                table_variants = self.get_table_variants(clause="select")

            # extra infos
            # NOTE(review): bare except — best-effort, any failure is treated
            # as "no extra infos"
            try:
                extra_infos = self.get_extra_infos()
            except:
                extra_infos = []

            # Header infos
            header_infos = self.get_header().infos

            log.debug(
                f"Explode INFO fields - ADD [{len(header_infos)}] annotations fields"
            )

            # One "SET" clause per INFO field to explode
            sql_info_alter_table_array = []

            # Info fields to check (header fields plus explicitly requested ones)
            fields_list = list(header_infos)
            if fields:
                fields_list += fields
            fields_list = set(fields_list)

            # If no fields
            if not fields:
                fields = []

            # Translate fields if patterns (regex/"*" expansion against the header)
            fields = self.get_explode_infos_fields(explode_infos_fields=fields)

            for info in fields:

                # Name of the exploded column (e.g. "INFO/AF")
                info_id_sql = prefix + info

                if (
                    info in fields_list
                    or prefix + info in fields_list
                    or info in extra_infos
                ):

                    log.debug(f"Explode INFO fields - ADD '{info}' annotations fields")

                    # Field type/cardinality from the header; fields absent
                    # from the header default to String
                    if info in header_infos:
                        info_type = header_infos[info].type
                        info_num = header_infos[info].num
                    else:
                        info_type = "String"
                        info_num = 0

                    # Multi-valued fields are stored as VARCHAR regardless of type
                    type_sql = self.code_type_map_to_sql.get(info_type, "VARCHAR")
                    if info_num != 1:
                        type_sql = "VARCHAR"

                    # Add field (returns None when the column already existed)
                    added_column = self.add_column(
                        table_name=table_variants,
                        column_name=info_id_sql,
                        column_type=type_sql,
                        default_value="null",
                        drop=force,
                    )

                    if added_column:
                        added_columns.append(added_column)

                    if added_column or force:

                        # add field to index
                        self.index_additionnal_fields.append(info_id_sql)

                        # Update field array: extract "<info>=<value>" from the
                        # INFO column, mapping empty values ('' or '.') to NULL
                        if connexion_format in ["duckdb"]:
                            update_info_field = f"""
                            "{info_id_sql}" =
                                CASE
                                    WHEN REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1) IN ('','.') THEN NULL
                                    ELSE REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1)
                                END
                            """
                        elif connexion_format in ["sqlite"]:
                            update_info_field = f"""
                                "{info_id_sql}" =
                                    CASE
                                        WHEN instr(INFO, '{info}=') = 0 THEN NULL
                                        WHEN instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';') = 0 THEN substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1)
                                        ELSE substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1, instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';')-instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')-1)
                                    END
                            """

                        # NOTE(review): if connexion_format is neither "duckdb"
                        # nor "sqlite", update_info_field is unbound here
                        # (NameError on first iteration) — presumably
                        # unreachable since loading rejects other formats; confirm
                        sql_info_alter_table_array.append(update_info_field)

            if sql_info_alter_table_array:

                # By chromosomes (presumably to keep each UPDATE smaller;
                # bare except falls back to a single unfiltered pass)
                try:
                    chromosomes_list = list(
                        self.get_query_to_df(
                            f""" SELECT "#CHROM" FROM {table_variants} GROUP BY "#CHROM" """
                        )["#CHROM"]
                    )
                except:
                    chromosomes_list = [None]

                for chrom in chromosomes_list:
                    log.debug(f"Explode INFO fields - Chromosome {chrom}...")

                    # Where clause (only needed when there are several chromosomes)
                    where_clause = ""
                    if chrom and len(chromosomes_list) > 1:
                        where_clause = f""" WHERE "#CHROM" = '{chrom}' """

                    # Update table: either one UPDATE carrying every SET clause,
                    # or one UPDATE per exploded field
                    if proccess_all_fields_together:
                        sql_info_alter_table_array_join = ", ".join(
                            sql_info_alter_table_array
                        )
                        if sql_info_alter_table_array_join:
                            sql_info_alter_table = f"""
                                UPDATE {table_variants}
                                SET {sql_info_alter_table_array_join}
                                {where_clause}
                                """
                            log.debug(
                                f"Explode INFO fields - Explode all {len(sql_info_alter_table_array)} fields..."
                            )
                            # log.debug(sql_info_alter_table)
                            self.conn.execute(sql_info_alter_table)
                    else:
                        sql_info_alter_num = 0
                        for sql_info_alter in sql_info_alter_table_array:
                            sql_info_alter_num += 1
                            sql_info_alter_table = f"""
                                UPDATE {table_variants}
                                SET {sql_info_alter}
                                {where_clause}
                                """
                            log.debug(
                                f"Explode INFO fields - Explode field {sql_info_alter_num}/{len(sql_info_alter_table_array)}..."
                            )
                            # log.debug(sql_info_alter_table)
                            self.conn.execute(sql_info_alter_table)

        # create indexes
        if create_index:
            self.create_indexes()

        return added_columns
 1970
 1971    def create_indexes(self) -> None:
 1972        """
 1973        Create indexes on the table after insertion
 1974        """
 1975
 1976        # Access
 1977        access = self.get_config().get("access", None)
 1978
 1979        # get table variants
 1980        table_variants = self.get_table_variants("FROM")
 1981
 1982        if self.get_indexing() and access not in ["RO"]:
 1983            # Create index
 1984            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()} ON {table_variants} ("#CHROM", "POS", "REF", "ALT")'
 1985            self.conn.execute(sql_create_table_index)
 1986            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_chrom ON {table_variants} ("#CHROM")'
 1987            self.conn.execute(sql_create_table_index)
 1988            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_pos ON {table_variants} ("POS")'
 1989            self.conn.execute(sql_create_table_index)
 1990            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_ref ON {table_variants} ( "REF")'
 1991            self.conn.execute(sql_create_table_index)
 1992            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_alt ON {table_variants} ("ALT")'
 1993            self.conn.execute(sql_create_table_index)
 1994            for field in self.index_additionnal_fields:
 1995                sql_create_table_index = f""" CREATE INDEX IF NOT EXISTS "idx_{self.get_table_variants()}_{field}" ON {table_variants} ("{field}") """
 1996                self.conn.execute(sql_create_table_index)
 1997
 1998    def drop_indexes(self) -> None:
 1999        """
 2000        Create indexes on the table after insertion
 2001        """
 2002
 2003        # Access
 2004        access = self.get_config().get("access", None)
 2005
 2006        # get table variants
 2007        table_variants = self.get_table_variants("FROM")
 2008
 2009        # Get database format
 2010        connexion_format = self.get_connexion_format()
 2011
 2012        if access not in ["RO"]:
 2013            if connexion_format in ["duckdb"]:
 2014                sql_list_indexes = f"SELECT index_name FROM duckdb_indexes WHERE table_name='{table_variants}'"
 2015            elif connexion_format in ["sqlite"]:
 2016                sql_list_indexes = f"SELECT name FROM sqlite_master WHERE type='index' AND tbl_name='{table_variants}';"
 2017
 2018            list_indexes = self.conn.execute(sql_list_indexes)
 2019            index_names = [row[0] for row in list_indexes.fetchall()]
 2020            for index in index_names:
 2021                sql_drop_table_index = f""" DROP INDEX IF EXISTS "{index}" """
 2022                self.conn.execute(sql_drop_table_index)
 2023
 2024    def read_vcf_header(self, f) -> list:
 2025        """
 2026        It reads the header of a VCF file and returns a list of the header lines
 2027
 2028        :param f: the file object
 2029        :return: The header lines of the VCF file.
 2030        """
 2031
 2032        header_list = []
 2033        for line in f:
 2034            header_list.append(line)
 2035            if line.startswith("#CHROM"):
 2036                break
 2037        return header_list
 2038
 2039    def read_vcf_header_file(self, file: str = None) -> list:
 2040        """
 2041        The `read_vcf_header_file` function reads the header of a VCF file, handling both compressed and
 2042        uncompressed files.
 2043
 2044        :param file: The `file` parameter is a string that represents the path to the VCF header file
 2045        that you want to read. It is an optional parameter, so if you don't provide a value, it will
 2046        default to `None`
 2047        :type file: str
 2048        :return: The function `read_vcf_header_file` returns a list.
 2049        """
 2050
 2051        if self.get_input_compressed(input_file=file):
 2052            with bgzf.open(file, "rt") as f:
 2053                return self.read_vcf_header(f=f)
 2054        else:
 2055            with open(file, "rt") as f:
 2056                return self.read_vcf_header(f=f)
 2057
 2058    def execute_query(self, query: str):
 2059        """
 2060        It takes a query as an argument, executes it, and returns the results
 2061
 2062        :param query: The query to be executed
 2063        :return: The result of the query is being returned.
 2064        """
 2065        if query:
 2066            return self.conn.execute(query)  # .fetchall()
 2067        else:
 2068            return None
 2069
 2070    def export_output(
 2071        self,
 2072        output_file: str | None = None,
 2073        output_header: str | None = None,
 2074        export_header: bool = True,
 2075        query: str | None = None,
 2076        parquet_partitions: list | None = None,
 2077        chunk_size: int | None = None,
 2078        threads: int | None = None,
 2079        sort: bool = False,
 2080        index: bool = False,
 2081        order_by: str | None = None,
 2082    ) -> bool:
 2083        """
 2084        The `export_output` function exports data from a VCF file to a specified output file in various
 2085        formats, including VCF, CSV, TSV, PSV, and Parquet.
 2086
 2087        :param output_file: The `output_file` parameter is a string that specifies the name of the
 2088        output file to be generated by the function. This is where the exported data will be saved
 2089        :type output_file: str
 2090        :param output_header: The `output_header` parameter is a string that specifies the name of the
 2091        file where the header of the VCF file will be exported. If this parameter is not provided, the
 2092        header will be exported to a file with the same name as the `output_file` parameter, but with
 2093        the extension "
 2094        :type output_header: str
 2095        :param export_header: The `export_header` parameter is a boolean flag that determines whether
 2096        the header of a VCF file should be exported to a separate file or not. If `export_header` is
 2097        True, the header will be exported to a file. If `export_header` is False, the header will not
 2098        be, defaults to True, if output format is not VCF
 2099        :type export_header: bool (optional)
 2100        :param query: The `query` parameter is an optional SQL query that can be used to filter and
 2101        select specific data from the VCF file before exporting it. If provided, only the data that
 2102        matches the query will be exported
 2103        :type query: str
 2104        :param parquet_partitions: The `parquet_partitions` parameter is a list that specifies the
 2105        columns to be used for partitioning the Parquet file during export. Partitioning is a way to
 2106        organize data in a hierarchical directory structure based on the values of one or more columns.
 2107        This can improve query performance when working with large datasets
 2108        :type parquet_partitions: list
 2109        :param chunk_size: The `chunk_size` parameter specifies the number of
 2110        records in batch when exporting data in Parquet format. This parameter is used for
 2111        partitioning the Parquet file into multiple files.
 2112        :type chunk_size: int
 2113        :param threads: The `threads` parameter is an optional parameter that specifies the number of
 2114        threads to be used during the export process. It determines the level of parallelism and can
 2115        improve the performance of the export operation. If not provided, the function will use the
 2116        default number of threads
 2117        :type threads: int
 2118        :param sort: The `sort` parameter is a boolean flag that determines whether the output file
 2119        should be sorted or not. If `sort` is set to `True`, the output file will be sorted based on the
 2120        genomic coordinates of the variants. By default, the value of `sort` is `False`, defaults to
 2121        False
 2122        :type sort: bool (optional)
 2123        :param index: The `index` parameter is a boolean flag that determines whether an index should be
 2124        created on the output file. If `index` is True, an index will be created. If `index` is False,
 2125        no index will be created. The default value is False, defaults to False
 2126        :type index: bool (optional)
 2127        :param order_by: The `order_by` parameter is a string that specifies the column(s) to use for
 2128        sorting the output file. This parameter is only applicable when exporting data in VCF format
 2129        :type order_by: str
 2130        :return: a boolean value. It checks if the output file exists and returns True if it does, or
 2131        None if it doesn't.
 2132        """
 2133
 2134        # Log
 2135        log.info("Exporting...")
 2136
 2137        # Full path
 2138        output_file = full_path(output_file)
 2139        output_header = full_path(output_header)
 2140
 2141        # Config
 2142        config = self.get_config()
 2143
 2144        # Param
 2145        param = self.get_param()
 2146
 2147        # Tmp files to remove
 2148        tmp_to_remove = []
 2149
 2150        # If no output, get it
 2151        if not output_file:
 2152            output_file = self.get_output()
 2153
 2154        # If not threads
 2155        if not threads:
 2156            threads = self.get_threads()
 2157
 2158        # Auto header name with extension
 2159        if export_header or output_header:
 2160            if not output_header:
 2161                output_header = f"{output_file}.hdr"
 2162            # Export header
 2163            self.export_header(output_file=output_file)
 2164
 2165        # Switch off export header if VCF output
 2166        output_file_type = get_file_format(output_file)
 2167        if output_file_type in ["vcf"]:
 2168            export_header = False
 2169            tmp_to_remove.append(output_header)
 2170
 2171        # Chunk size
 2172        if not chunk_size:
 2173            chunk_size = config.get("chunk_size", None)
 2174
 2175        # Parquet partition
 2176        if not parquet_partitions:
 2177            parquet_partitions = param.get("export", {}).get("parquet_partitions", None)
 2178        if parquet_partitions and isinstance(parquet_partitions, str):
 2179            parquet_partitions = parquet_partitions.split(",")
 2180
 2181        # Order by
 2182        if not order_by:
 2183            order_by = param.get("export", {}).get("order_by", "")
 2184
 2185        # Header in output
 2186        header_in_output = param.get("export", {}).get("include_header", False)
 2187
 2188        # Database
 2189        database_source = self.get_connexion()
 2190
 2191        # Connexion format
 2192        connexion_format = self.get_connexion_format()
 2193
 2194        # Explode infos
 2195        if self.get_explode_infos():
 2196            self.explode_infos(
 2197                prefix=self.get_explode_infos_prefix(),
 2198                fields=self.get_explode_infos_fields(),
 2199                force=False,
 2200            )
 2201
 2202        # if connexion_format in ["sqlite"] or query:
 2203        if connexion_format in ["sqlite"]:
 2204
 2205            # Export in Parquet
 2206            random_tmp = "".join(
 2207                random.choice(string.ascii_lowercase) for i in range(10)
 2208            )
 2209            database_source = f"""{output_file}.{random_tmp}.database_export.parquet"""
 2210            tmp_to_remove.append(database_source)
 2211
 2212            # Table Variants
 2213            table_variants = self.get_table_variants()
 2214
 2215            # Create export query
 2216            sql_query_export_subquery = f"""
 2217                SELECT * FROM {table_variants}
 2218                """
 2219
 2220            # Write source file
 2221            fp.write(database_source, self.get_query_to_df(sql_query_export_subquery))
 2222
 2223        # Create database
 2224        database = Database(
 2225            database=database_source,
 2226            table="variants",
 2227            header_file=output_header,
 2228            conn_config=self.get_connexion_config(),
 2229        )
 2230
 2231        # Existing colomns header
 2232        existing_columns_header = database.get_header_columns_from_database()
 2233
 2234        # Sample list
 2235        get_samples = self.get_samples()
 2236        get_samples_check = self.get_samples_check()
 2237        samples_force = get_samples is not None
 2238        sample_list = self.get_header_sample_list(
 2239            check=get_samples_check, samples=get_samples, samples_force=samples_force
 2240        )
 2241
 2242        # Export file
 2243        database.export(
 2244            output_database=output_file,
 2245            output_header=output_header,
 2246            existing_columns_header=existing_columns_header,
 2247            parquet_partitions=parquet_partitions,
 2248            chunk_size=chunk_size,
 2249            threads=threads,
 2250            sort=sort,
 2251            index=index,
 2252            header_in_output=header_in_output,
 2253            order_by=order_by,
 2254            query=query,
 2255            export_header=export_header,
 2256            sample_list=sample_list,
 2257        )
 2258
 2259        # Remove
 2260        remove_if_exists(tmp_to_remove)
 2261
 2262        return (os.path.exists(output_file) or None) and (
 2263            os.path.exists(output_file) or None
 2264        )
 2265
 2266    def get_extra_infos(self, table: str = None) -> list:
 2267        """
 2268        The `get_extra_infos` function returns a list of columns that are in a specified table but not
 2269        in the header.
 2270
 2271        :param table: The `table` parameter in the `get_extra_infos` function is used to specify the
 2272        name of the table from which you want to retrieve the extra columns that are not present in the
 2273        header. If the `table` parameter is not provided when calling the function, it will default to
 2274        using the variants
 2275        :type table: str
 2276        :return: A list of columns that are in the specified table but not in the header of the table.
 2277        """
 2278
 2279        header_columns = []
 2280
 2281        if not table:
 2282            table = self.get_table_variants(clause="from")
 2283            header_columns = self.get_header_columns()
 2284
 2285        # Check all columns in the database
 2286        query = f""" SELECT * FROM {table} LIMIT 1 """
 2287        log.debug(f"query {query}")
 2288        table_columns = self.get_query_to_df(query).columns.tolist()
 2289        extra_columns = []
 2290
 2291        # Construct extra infos (not in header)
 2292        for column in table_columns:
 2293            if column not in header_columns:
 2294                extra_columns.append(column)
 2295
 2296        return extra_columns
 2297
 2298    def get_extra_infos_sql(self, table: str = None) -> str:
 2299        """
 2300        It returns a string of the extra infos, separated by commas, and each extra info is surrounded
 2301        by double quotes
 2302
 2303        :param table: The name of the table to get the extra infos from. If None, the default table is
 2304        used
 2305        :type table: str
 2306        :return: A string of the extra infos
 2307        """
 2308
 2309        return ", ".join(
 2310            ['"' + str(elem) + '"' for elem in self.get_extra_infos(table=table)]
 2311        )
 2312
 2313    def export_header(
 2314        self,
 2315        header_name: str = None,
 2316        output_file: str = None,
 2317        output_file_ext: str = ".hdr",
 2318        clean_header: bool = True,
 2319        remove_chrom_line: bool = False,
 2320    ) -> str:
 2321        """
 2322        The `export_header` function takes a VCF file, extracts the header, modifies it according to
 2323        specified options, and writes it to a new file.
 2324
 2325        :param header_name: The `header_name` parameter is the name of the header file to be created. If
 2326        this parameter is not specified, the header will be written to the output file
 2327        :type header_name: str
 2328        :param output_file: The `output_file` parameter in the `export_header` function is used to
 2329        specify the name of the output file where the header will be written. If this parameter is not
 2330        provided, the header will be written to a temporary file
 2331        :type output_file: str
 2332        :param output_file_ext: The `output_file_ext` parameter in the `export_header` function is a
 2333        string that represents the extension of the output header file. By default, it is set to ".hdr"
 2334        if not specified by the user. This extension will be appended to the `output_file` name to
 2335        create the final, defaults to .hdr
 2336        :type output_file_ext: str (optional)
 2337        :param clean_header: The `clean_header` parameter in the `export_header` function is a boolean
 2338        flag that determines whether the header should be cleaned or not. When `clean_header` is set to
 2339        `True`, the function will clean the header by modifying certain lines based on a specific
 2340        pattern. If `clean_header`, defaults to True
 2341        :type clean_header: bool (optional)
 2342        :param remove_chrom_line: The `remove_chrom_line` parameter in the `export_header` function is a
 2343        boolean flag that determines whether the #CHROM line should be removed from the header before
 2344        writing it to the output file. If set to `True`, the #CHROM line will be removed; if set to `,
 2345        defaults to False
 2346        :type remove_chrom_line: bool (optional)
 2347        :return: The function `export_header` returns the name of the temporary header file that is
 2348        created.
 2349        """
 2350
 2351        if not header_name and not output_file:
 2352            output_file = self.get_output()
 2353
 2354        if self.get_header():
 2355
 2356            # Get header object
 2357            header_obj = self.get_header()
 2358
 2359            # Create database
 2360            db_for_header = Database(database=self.get_input())
 2361
 2362            # Get real columns in the file
 2363            db_header_columns = db_for_header.get_columns()
 2364
 2365            with tempfile.TemporaryDirectory() as tmpdir:
 2366
 2367                # Write header file
 2368                header_file_tmp = os.path.join(tmpdir, "header")
 2369                f = open(header_file_tmp, "w")
 2370                vcf.Writer(f, header_obj)
 2371                f.close()
 2372
 2373                # Replace #CHROM line with rel columns
 2374                header_list = db_for_header.read_header_file(
 2375                    header_file=header_file_tmp
 2376                )
 2377                header_list[-1] = "\t".join(db_header_columns)
 2378
 2379                # Remove CHROM line
 2380                if remove_chrom_line:
 2381                    header_list.pop()
 2382
 2383                # Clean header
 2384                if clean_header:
 2385                    header_list_clean = []
 2386                    for head in header_list:
 2387                        # Clean head for malformed header
 2388                        head_clean = head
 2389                        head_clean = re.subn(
 2390                            "##FORMAT=<ID=(.*),Number=(.*),Type=Flag",
 2391                            r"##FORMAT=<ID=\1,Number=\2,Type=String",
 2392                            head_clean,
 2393                            2,
 2394                        )[0]
 2395                        # Write header
 2396                        header_list_clean.append(head_clean)
 2397                    header_list = header_list_clean
 2398
 2399            tmp_header_name = output_file + output_file_ext
 2400
 2401            f = open(tmp_header_name, "w")
 2402            for line in header_list:
 2403                f.write(line)
 2404            f.close()
 2405
 2406        return tmp_header_name
 2407
 2408    def export_variant_vcf(
 2409        self,
 2410        vcf_file,
 2411        remove_info: bool = False,
 2412        add_samples: bool = True,
 2413        list_samples: list = [],
 2414        where_clause: str = "",
 2415        index: bool = False,
 2416        threads: int | None = None,
 2417    ) -> bool | None:
 2418        """
 2419        The `export_variant_vcf` function exports a VCF file with specified samples, allowing options to
 2420        remove INFO field, add samples, and control compression and indexing.
 2421
 2422        :param vcf_file: The `vcf_file` parameter is the name of the file where the VCF data will be
 2423        written to. It is the output file that will contain the filtered VCF data based on the specified
 2424        parameters
 2425        :param remove_info: The `remove_info` parameter in the `export_variant_vcf` function is a
 2426        boolean flag that determines whether to remove the INFO field from the output VCF file. If set
 2427        to `True`, the INFO field will be removed. If set to `False`, the INFO field will be included
 2428        in, defaults to False
 2429        :type remove_info: bool (optional)
 2430        :param add_samples: The `add_samples` parameter is a boolean parameter that determines whether
 2431        the samples should be added to the VCF file or not. If set to True, the samples will be added.
 2432        If set to False, the samples will be removed. The default value is True, defaults to True
 2433        :type add_samples: bool (optional)
 2434        :param list_samples: The `list_samples` parameter is a list of samples that you want to include
 2435        in the output VCF file. By default, all samples will be included. If you provide a list of
 2436        samples, only those samples will be included in the output file
 2437        :type list_samples: list
 2438        :param index: The `index` parameter in the `export_variant_vcf` function is a boolean flag that
 2439        determines whether or not to create an index for the output VCF file. If `index` is set to
 2440        `True`, the output VCF file will be indexed using tabix. If `index`, defaults to False
 2441        :type index: bool (optional)
 2442        :param threads: The `threads` parameter in the `export_variant_vcf` function specifies the
 2443        number of threads to use for exporting the VCF file. It determines how many parallel threads
 2444        will be used during the export process. More threads can potentially speed up the export process
 2445        by utilizing multiple cores of the processor. If
 2446        :type threads: int | None
 2447        :return: The `export_variant_vcf` function returns the result of calling the `export_output`
 2448        method with various parameters including the output file, query, threads, sort flag, and index
 2449        flag. The `export_output` method is responsible for exporting the VCF data based on the
 2450        specified parameters and configurations provided in the `export_variant_vcf` function.
 2451        """
 2452
 2453        # Config
 2454        config = self.get_config()
 2455
 2456        # Extract VCF
 2457        log.debug("Export VCF...")
 2458
 2459        # Table variants
 2460        table_variants = self.get_table_variants()
 2461
 2462        # Threads
 2463        if not threads:
 2464            threads = self.get_threads()
 2465
 2466        # Info fields
 2467        if remove_info:
 2468            if not isinstance(remove_info, str):
 2469                remove_info = "."
 2470            info_field = f"""'{remove_info}' as INFO"""
 2471        else:
 2472            info_field = "INFO"
 2473
 2474        # Samples fields
 2475        if add_samples:
 2476            if not list_samples:
 2477                list_samples = self.get_header_sample_list()
 2478            if list_samples:
 2479                samples_fields = " , FORMAT , " + " , ".join(list_samples)
 2480            else:
 2481                samples_fields = ""
 2482            log.debug(f"samples_fields: {samples_fields}")
 2483        else:
 2484            samples_fields = ""
 2485
 2486        # Where clause
 2487        if where_clause is None:
 2488            where_clause = ""
 2489
 2490        # Variants
 2491        select_fields = """ "#CHROM", POS, ID, REF, ALT, QUAL, FILTER """
 2492        sql_query_select = f""" SELECT {select_fields}, {info_field} {samples_fields} FROM {table_variants} {where_clause} """
 2493        log.debug(f"sql_query_select={sql_query_select}")
 2494
 2495        return self.export_output(
 2496            output_file=vcf_file,
 2497            output_header=None,
 2498            export_header=True,
 2499            query=sql_query_select,
 2500            parquet_partitions=None,
 2501            chunk_size=config.get("chunk_size", None),
 2502            threads=threads,
 2503            sort=True,
 2504            index=index,
 2505            order_by=None,
 2506        )
 2507
 2508    def run_commands(self, commands: list = [], threads: int = 1) -> None:
 2509        """
 2510        It takes a list of commands and runs them in parallel using the number of threads specified
 2511
 2512        :param commands: A list of commands to run
 2513        :param threads: The number of threads to use, defaults to 1 (optional)
 2514        """
 2515
 2516        run_parallel_commands(commands, threads)
 2517
 2518    def get_threads(self, default: int = 1) -> int:
 2519        """
 2520        This function returns the number of threads to use for a job, with a default value of 1 if not
 2521        specified.
 2522
 2523        :param default: The `default` parameter in the `get_threads` method is used to specify the
 2524        default number of threads to use if no specific value is provided. If no value is provided for
 2525        the `threads` parameter in the configuration or input parameters, the `default` value will be
 2526        used, defaults to 1
 2527        :type default: int (optional)
 2528        :return: the number of threads to use for the current job.
 2529        """
 2530
 2531        # Config
 2532        config = self.get_config()
 2533
 2534        # Param
 2535        param = self.get_param()
 2536
 2537        # Input threads
 2538        input_thread = param.get("threads", config.get("threads", None))
 2539
 2540        # Check threads
 2541        if not input_thread:
 2542            threads = default
 2543        elif int(input_thread) <= 0:
 2544            threads = os.cpu_count()
 2545        else:
 2546            threads = int(input_thread)
 2547        return threads
 2548
 2549    def get_memory(self, default: str = None) -> str:
 2550        """
 2551        This function retrieves the memory value from parameters or configuration with a default value
 2552        if not found.
 2553
 2554        :param default: The `get_memory` function takes in a default value as a string parameter. This
 2555        default value is used as a fallback in case the `memory` parameter is not provided in the
 2556        `param` dictionary or the `config` dictionary. If `memory` is not found in either dictionary,
 2557        the function
 2558        :type default: str
 2559        :return: The `get_memory` function returns a string value representing the memory parameter. If
 2560        the `input_memory` is provided in the parameters, it will return that value. Otherwise, it will
 2561        return the default value provided as an argument to the function.
 2562        """
 2563
 2564        # Config
 2565        config = self.get_config()
 2566
 2567        # Param
 2568        param = self.get_param()
 2569
 2570        # Input threads
 2571        input_memory = param.get("memory", config.get("memory", None))
 2572
 2573        # Check threads
 2574        if input_memory:
 2575            memory = input_memory
 2576        else:
 2577            memory = default
 2578
 2579        return memory
 2580
 2581    def update_from_vcf(self, vcf_file: str) -> None:
 2582        """
 2583        > If the database is duckdb, then use the parquet method, otherwise use the sqlite method
 2584
 2585        :param vcf_file: the path to the VCF file
 2586        """
 2587
 2588        connexion_format = self.get_connexion_format()
 2589
 2590        if connexion_format in ["duckdb"]:
 2591            self.update_from_vcf_duckdb(vcf_file)
 2592        elif connexion_format in ["sqlite"]:
 2593            self.update_from_vcf_sqlite(vcf_file)
 2594
    def update_from_vcf_duckdb(self, vcf_file: str) -> None:
        """
        Update the INFO column of the variants table with the INFO column of a
        VCF file, using DuckDB.

        The VCF is loaded into a pandas DataFrame named `vcf_df`, which the SQL
        below references by name — presumably relying on DuckDB's replacement
        scan of in-scope DataFrames (TODO confirm the connexion is DuckDB here).
        Existing INFO content is kept and the matching VCF INFO is appended,
        separated by ';' when both sides are non-empty ('' and '.' are treated
        as empty).

        :param vcf_file: the path to the VCF file
        """

        # variants table
        table_variants = self.get_table_variants()

        # Load the VCF body into a DataFrame; skipping the meta-header lines
        # makes the '#CHROM' line the column header (header=0)
        skip = self.get_header_length(file=vcf_file)
        vcf_df = pd.read_csv(
            vcf_file,
            sep="\t",
            engine="c",
            skiprows=skip,
            header=0,
            low_memory=False,
        )
        # Merge INFO: keep existing INFO, append the VCF's INFO for variants
        # matched on #CHROM/POS/REF/ALT (chromosome compared as VARCHAR)
        sql_query_update = f"""
        UPDATE {table_variants} as table_variants
            SET INFO = concat(
                            CASE
                                WHEN INFO NOT IN ('', '.')
                                THEN INFO
                                ELSE ''
                            END,
                            (
                                SELECT 
                                    concat(
                                        CASE
                                            WHEN table_variants.INFO NOT IN ('','.') AND table_parquet.INFO NOT IN ('','.')
                                            THEN ';'
                                            ELSE ''
                                        END
                                        ,
                                        CASE
                                            WHEN table_parquet.INFO NOT IN ('','.')
                                            THEN table_parquet.INFO
                                            ELSE ''
                                        END
                                    )
                                FROM vcf_df as table_parquet
                                        WHERE CAST(table_parquet.\"#CHROM\" AS VARCHAR) = CAST(table_variants.\"#CHROM\" AS VARCHAR)
                                        AND table_parquet.\"POS\" = table_variants.\"POS\"
                                        AND table_parquet.\"ALT\" = table_variants.\"ALT\"
                                        AND table_parquet.\"REF\" = table_variants.\"REF\"
                                        AND table_parquet.INFO NOT IN ('','.')
                            )
                        )
            ;
            """
        self.conn.execute(sql_query_update)
 2650
    def update_from_vcf_sqlite(self, vcf_file: str) -> None:
        """
        Update the INFO column of the variants table with the INFO column of a
        VCF file, using SQLite.

        A temporary table shaped like the variants table is created, the VCF
        body is loaded into it, the INFO columns are merged (existing INFO is
        kept and the matching VCF INFO is appended with a ';' separator when
        both sides are non-empty; '' and '.' are treated as empty), and the
        temporary table is dropped.

        :param vcf_file: the path to the VCF file you want to update the
        database with
        """

        # Create an empty temporary table with the variants table's columns
        # (WHERE 0 copies the schema without copying any rows)
        table_vcf = "tmp_vcf"
        sql_create = (
            f"CREATE TEMPORARY TABLE {table_vcf} AS SELECT * FROM variants WHERE 0"
        )
        self.conn.execute(sql_create)

        # Load the VCF body into the temporary table; comment='#' skips the
        # meta-header and the '#CHROM' line, hence the explicit column names
        vcf_df = pd.read_csv(
            vcf_file, sep="\t", comment="#", header=None, low_memory=False
        )
        vcf_df.columns = ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"]
        vcf_df.to_sql(table_vcf, self.conn, if_exists="append", index=False)

        # Update table 'variants' with VCF data, matching on #CHROM/POS/REF/ALT
        # warning: CONCAT as || operator (SQLite string concatenation)
        sql_query_update = f"""
            UPDATE variants as table_variants
            SET INFO = CASE
                            WHEN INFO NOT IN ('', '.')
                            THEN INFO
                            ELSE ''
                        END ||
                        (
                        SELECT 
                            CASE 
                                WHEN table_variants.INFO NOT IN ('','.') 
                                    AND table_vcf.INFO NOT IN ('','.')  
                                THEN ';' 
                                ELSE '' 
                            END || 
                            CASE 
                                WHEN table_vcf.INFO NOT IN ('','.') 
                                THEN table_vcf.INFO 
                                ELSE '' 
                            END
                        FROM {table_vcf} as table_vcf
                        WHERE table_vcf.\"#CHROM\" = table_variants.\"#CHROM\"
                            AND table_vcf.\"POS\" = table_variants.\"POS\"
                            AND table_vcf.\"ALT\" = table_variants.\"ALT\"
                            AND table_vcf.\"REF\" = table_variants.\"REF\"
                        )
        """
        self.conn.execute(sql_query_update)

        # Drop the temporary table
        sql_drop = f"DROP TABLE {table_vcf}"
        self.conn.execute(sql_drop)
 2708
 2709    def drop_variants_table(self) -> None:
 2710        """
 2711        > This function drops the variants table
 2712        """
 2713
 2714        table_variants = self.get_table_variants()
 2715        sql_table_variants = f"DROP TABLE IF EXISTS {table_variants}"
 2716        self.conn.execute(sql_table_variants)
 2717
    def set_variant_id(
        self, variant_id_column: str = "variant_id", force: bool = None
    ) -> str:
        """
        Add a variant id column to the variants table and populate it with a
        hash of the assembly, `#CHROM`, `POS`, `REF`, `ALT` and an SVTYPE tag.

        :param variant_id_column: The name of the column to be created in the
        variants table, defaults to variant_id
        :type variant_id_column: str (optional)
        :param force: If True, the column is (re)created and repopulated even
        if it already exists
        :type force: bool
        :return: The name of the column that contains the variant id
        """

        # Assembly (param takes precedence over config, then the default)
        assembly = self.get_param().get(
            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
        )

        # INFO/Tag prefix
        prefix = self.get_explode_infos_prefix()

        # Explode INFO/SVTYPE into its own column (dropped again below)
        added_columns = self.explode_infos(prefix=prefix, fields=["SVTYPE"])

        # variants table
        table_variants = self.get_table_variants()

        # Fall back to the default column name
        if not variant_id_column:
            variant_id_column = "variant_id"

        # Create variant_id column
        # NOTE(review): the existence check uses the literal "variant_id",
        # not `variant_id_column` — with a custom column name this branch runs
        # on every call; confirm whether that is intended.
        if "variant_id" not in self.get_extra_infos() or force:

            # Create column
            self.add_column(
                table_name=table_variants,
                column_name=variant_id_column,
                column_type="UBIGINT",
                default_value="0",
            )

            # Update column
            # NOTE(review): the last hash argument is the SQL string literal
            # '"{prefix}SVTYPE"' (i.e. the quoted column NAME), not the
            # column's value — confirm whether the SVTYPE value was meant to
            # be hashed instead; changing it would change all variant ids.
            self.conn.execute(
                f"""
                    UPDATE {table_variants}
                    SET "{variant_id_column}" = hash('{assembly}', "#CHROM", "POS", "REF", "ALT", '"{prefix}SVTYPE"')
                """
            )

        # Remove the columns added by explode_infos above
        for added_column in added_columns:
            self.drop_column(column=added_column)

        # return variant_id column name
        return variant_id_column
 2776
 2777    def get_variant_id_column(
 2778        self, variant_id_column: str = "variant_id", force: bool = None
 2779    ) -> str:
 2780        """
 2781        This function returns the variant_id column name
 2782
 2783        :param variant_id_column: The name of the column in the dataframe that contains the variant IDs,
 2784        defaults to variant_id
 2785        :type variant_id_column: str (optional)
 2786        :param force: If True, will force the variant_id to be set to the value of variant_id_column. If
 2787        False, will only set the variant_id if it is not already set. If None, will set the variant_id
 2788        if it is not already set, or if it is set
 2789        :type force: bool
 2790        :return: The variant_id column name.
 2791        """
 2792
 2793        return self.set_variant_id(variant_id_column=variant_id_column, force=force)
 2794
 2795    ###
 2796    # Annotation
 2797    ###
 2798
 2799    def scan_databases(
 2800        self,
 2801        database_formats: list = ["parquet"],
 2802        database_releases: list = ["current"],
 2803    ) -> dict:
 2804        """
 2805        The function `scan_databases` scans for available databases based on specified formats and
 2806        releases.
 2807
 2808        :param database_formats: The `database_formats` parameter is a list that specifies the formats
 2809        of the databases to be scanned. In this case, the accepted format is "parquet"
 2810        :type database_formats: list ["parquet"]
 2811        :param database_releases: The `database_releases` parameter is a list that specifies the
 2812        releases of the databases to be scanned. In the provided function, the default value for
 2813        `database_releases` is set to `["current"]`, meaning that by default, the function will scan
 2814        databases that are in the "current"
 2815        :type database_releases: list
 2816        :return: The function `scan_databases` returns a dictionary containing information about
 2817        databases that match the specified formats and releases.
 2818        """
 2819
 2820        # Config
 2821        config = self.get_config()
 2822
 2823        # Param
 2824        param = self.get_param()
 2825
 2826        # Param - Assembly
 2827        assembly = param.get("assembly", config.get("assembly", None))
 2828        if not assembly:
 2829            assembly = DEFAULT_ASSEMBLY
 2830            log.warning(f"Default assembly '{assembly}'")
 2831
 2832        # Scan for availabled databases
 2833        log.info(
 2834            f"Annotations - Check annotation parameters - Scan existing databases - Assembly {[assembly]} - Formats {database_formats} - Releases {database_releases}..."
 2835        )
 2836        databases_infos_dict = databases_infos(
 2837            database_folder_releases=database_releases,
 2838            database_formats=database_formats,
 2839            assembly=assembly,
 2840            config=config,
 2841        )
 2842        log.info(
 2843            f"Annotations - Check annotation parameters - Scan existing databases - {len(databases_infos_dict)} databases found"
 2844        )
 2845
 2846        return databases_infos_dict
 2847
 2848    def annotation(self) -> None:
 2849        """
 2850        It annotates the VCF file with the annotations specified in the config file.
 2851        """
 2852
 2853        # Config
 2854        config = self.get_config()
 2855
 2856        # Param
 2857        param = self.get_param()
 2858
 2859        # Param - Assembly
 2860        assembly = param.get("assembly", config.get("assembly", None))
 2861        if not assembly:
 2862            assembly = DEFAULT_ASSEMBLY
 2863            log.warning(f"Default assembly '{assembly}'")
 2864
 2865        # annotations databases folders
 2866        annotations_databases = set(
 2867            config.get("folders", {})
 2868            .get("databases", {})
 2869            .get("annotations", [DEFAULT_ANNOTATIONS_FOLDER])
 2870            + config.get("folders", {})
 2871            .get("databases", {})
 2872            .get("parquet", ["~/howard/databases/parquet/current"])
 2873            + config.get("folders", {})
 2874            .get("databases", {})
 2875            .get("bcftools", ["~/howard/databases/bcftools/current"])
 2876        )
 2877
 2878        # Get param annotations
 2879        if param.get("annotations", None) and isinstance(
 2880            param.get("annotations", None), str
 2881        ):
 2882            log.debug(param.get("annotations", None))
 2883            param_annotation_list = param.get("annotations").split(",")
 2884        else:
 2885            param_annotation_list = []
 2886
 2887        # Each tools param
 2888        if param.get("annotation_parquet", None) != None:
 2889            log.debug(
 2890                f"""param.get("annotation_parquet", None)={param.get("annotation_parquet", None)}"""
 2891            )
 2892            if isinstance(param.get("annotation_parquet", None), list):
 2893                param_annotation_list.append(",".join(param.get("annotation_parquet")))
 2894            else:
 2895                param_annotation_list.append(param.get("annotation_parquet"))
 2896        if param.get("annotation_snpsift", None) != None:
 2897            if isinstance(param.get("annotation_snpsift", None), list):
 2898                param_annotation_list.append(
 2899                    "snpsift:"
 2900                    + "+".join(param.get("annotation_snpsift")).replace(",", "+")
 2901                )
 2902            else:
 2903                param_annotation_list.append(
 2904                    "snpsift:" + param.get("annotation_snpsift").replace(",", "+")
 2905                )
 2906        if param.get("annotation_snpeff", None) != None:
 2907            param_annotation_list.append("snpeff:" + param.get("annotation_snpeff"))
 2908        if param.get("annotation_bcftools", None) != None:
 2909            if isinstance(param.get("annotation_bcftools", None), list):
 2910                param_annotation_list.append(
 2911                    "bcftools:"
 2912                    + "+".join(param.get("annotation_bcftools")).replace(",", "+")
 2913                )
 2914            else:
 2915                param_annotation_list.append(
 2916                    "bcftools:" + param.get("annotation_bcftools").replace(",", "+")
 2917                )
 2918        if param.get("annotation_annovar", None) != None:
 2919            param_annotation_list.append("annovar:" + param.get("annotation_annovar"))
 2920        if param.get("annotation_exomiser", None) != None:
 2921            param_annotation_list.append("exomiser:" + param.get("annotation_exomiser"))
 2922        if param.get("annotation_splice", None) != None:
 2923            param_annotation_list.append("splice:" + param.get("annotation_splice"))
 2924
 2925        # Merge param annotations list
 2926        param["annotations"] = ",".join(param_annotation_list)
 2927
 2928        # debug
 2929        log.debug(f"param_annotations={param['annotations']}")
 2930
 2931        if param.get("annotations"):
 2932
 2933            # Log
 2934            # log.info("Annotations - Check annotation parameters")
 2935
 2936            if not "annotation" in param:
 2937                param["annotation"] = {}
 2938
 2939            # List of annotations parameters
 2940            annotations_list_input = {}
 2941            if isinstance(param.get("annotations", None), str):
 2942                annotation_file_list = [
 2943                    value for value in param.get("annotations", "").split(",")
 2944                ]
 2945                for annotation_file in annotation_file_list:
 2946                    annotations_list_input[annotation_file] = {"INFO": None}
 2947            else:
 2948                annotations_list_input = param.get("annotations", {})
 2949
 2950            log.info(f"Quick Annotations:")
 2951            for annotation_key in list(annotations_list_input.keys()):
 2952                log.info(f"   {annotation_key}")
 2953
 2954            # List of annotations and associated fields
 2955            annotations_list = {}
 2956
 2957            for annotation_file in annotations_list_input:
 2958
 2959                # Explode annotations if ALL
 2960                if (
 2961                    annotation_file.upper() == "ALL"
 2962                    or annotation_file.upper().startswith("ALL:")
 2963                ):
 2964
 2965                    # check ALL parameters (formats, releases)
 2966                    annotation_file_split = annotation_file.split(":")
 2967                    database_formats = "parquet"
 2968                    database_releases = "current"
 2969                    for annotation_file_option in annotation_file_split[1:]:
 2970                        database_all_options_split = annotation_file_option.split("=")
 2971                        if database_all_options_split[0] == "format":
 2972                            database_formats = database_all_options_split[1].split("+")
 2973                        if database_all_options_split[0] == "release":
 2974                            database_releases = database_all_options_split[1].split("+")
 2975
 2976                    # Scan for availabled databases
 2977                    databases_infos_dict = self.scan_databases(
 2978                        database_formats=database_formats,
 2979                        database_releases=database_releases,
 2980                    )
 2981
 2982                    # Add found databases in annotation parameters
 2983                    for database_infos in databases_infos_dict.keys():
 2984                        annotations_list[database_infos] = {"INFO": None}
 2985
 2986                else:
 2987                    annotations_list[annotation_file] = annotations_list_input[
 2988                        annotation_file
 2989                    ]
 2990
 2991            # Check each databases
 2992            if len(annotations_list):
 2993
 2994                log.info(
 2995                    f"Annotations - Check annotation parameters - Check {len(annotations_list)} databases..."
 2996                )
 2997
 2998                for annotation_file in annotations_list:
 2999
 3000                    # Init
 3001                    annotations = annotations_list.get(annotation_file, None)
 3002
 3003                    # Annotation snpEff
 3004                    if annotation_file.startswith("snpeff"):
 3005
 3006                        log.debug(f"Quick Annotation snpEff")
 3007
 3008                        if "snpeff" not in param["annotation"]:
 3009                            param["annotation"]["snpeff"] = {}
 3010
 3011                        if "options" not in param["annotation"]["snpeff"]:
 3012                            param["annotation"]["snpeff"]["options"] = ""
 3013
 3014                        # snpEff options in annotations
 3015                        param["annotation"]["snpeff"]["options"] = "".join(
 3016                            annotation_file.split(":")[1:]
 3017                        )
 3018
 3019                    # Annotation Annovar
 3020                    elif annotation_file.startswith("annovar"):
 3021
 3022                        log.debug(f"Quick Annotation Annovar")
 3023
 3024                        if "annovar" not in param["annotation"]:
 3025                            param["annotation"]["annovar"] = {}
 3026
 3027                        if "annotations" not in param["annotation"]["annovar"]:
 3028                            param["annotation"]["annovar"]["annotations"] = {}
 3029
 3030                        # Options
 3031                        annotation_file_split = annotation_file.split(":")
 3032                        for annotation_file_annotation in annotation_file_split[1:]:
 3033                            if annotation_file_annotation:
 3034                                param["annotation"]["annovar"]["annotations"][
 3035                                    annotation_file_annotation
 3036                                ] = annotations
 3037
 3038                    # Annotation Exomiser
 3039                    elif annotation_file.startswith("exomiser"):
 3040
 3041                        log.debug(f"Quick Annotation Exomiser")
 3042
 3043                        param["annotation"]["exomiser"] = params_string_to_dict(
 3044                            annotation_file
 3045                        )
 3046
 3047                    # Annotation Splice
 3048                    elif annotation_file.startswith("splice"):
 3049
 3050                        log.debug(f"Quick Annotation Splice")
 3051
 3052                        param["annotation"]["splice"] = params_string_to_dict(
 3053                            annotation_file
 3054                        )
 3055
 3056                    # Annotation Parquet or BCFTOOLS
 3057                    else:
 3058
 3059                        # Tools detection
 3060                        if annotation_file.startswith("bcftools:"):
 3061                            annotation_tool_initial = "bcftools"
 3062                            annotation_file = ":".join(annotation_file.split(":")[1:])
 3063                        elif annotation_file.startswith("snpsift:"):
 3064                            annotation_tool_initial = "snpsift"
 3065                            annotation_file = ":".join(annotation_file.split(":")[1:])
 3066                        else:
 3067                            annotation_tool_initial = None
 3068
 3069                        # list of files
 3070                        annotation_file_list = annotation_file.replace("+", ":").split(
 3071                            ":"
 3072                        )
 3073
 3074                        for annotation_file in annotation_file_list:
 3075
 3076                            if annotation_file:
 3077
 3078                                # Annotation tool initial
 3079                                annotation_tool = annotation_tool_initial
 3080
 3081                                # Find file
 3082                                annotation_file_found = None
 3083
 3084                                # Expand user
 3085                                annotation_file = full_path(annotation_file)
 3086
 3087                                if os.path.exists(annotation_file):
 3088                                    annotation_file_found = annotation_file
 3089
 3090                                else:
 3091                                    # Find within assembly folders
 3092                                    for annotations_database in annotations_databases:
 3093                                        found_files = find_all(
 3094                                            annotation_file,
 3095                                            os.path.join(
 3096                                                annotations_database, assembly
 3097                                            ),
 3098                                        )
 3099                                        if len(found_files) > 0:
 3100                                            annotation_file_found = found_files[0]
 3101                                            break
 3102                                    if not annotation_file_found and not assembly:
 3103                                        # Find within folders
 3104                                        for (
 3105                                            annotations_database
 3106                                        ) in annotations_databases:
 3107                                            found_files = find_all(
 3108                                                annotation_file, annotations_database
 3109                                            )
 3110                                            if len(found_files) > 0:
 3111                                                annotation_file_found = found_files[0]
 3112                                                break
 3113                                log.debug(
 3114                                    f"for {annotation_file} annotation_file_found={annotation_file_found}"
 3115                                )
 3116
 3117                                # Full path
 3118                                annotation_file_found = full_path(annotation_file_found)
 3119
 3120                                if annotation_file_found:
 3121
 3122                                    database = Database(database=annotation_file_found)
 3123                                    quick_annotation_format = database.get_format()
 3124                                    quick_annotation_is_compressed = (
 3125                                        database.is_compressed()
 3126                                    )
 3127                                    quick_annotation_is_indexed = os.path.exists(
 3128                                        f"{annotation_file_found}.tbi"
 3129                                    )
 3130                                    bcftools_preference = False
 3131
 3132                                    # Check Annotation Tool
 3133                                    if not annotation_tool:
 3134                                        if (
 3135                                            bcftools_preference
 3136                                            and quick_annotation_format
 3137                                            in ["vcf", "bed"]
 3138                                            and quick_annotation_is_compressed
 3139                                            and quick_annotation_is_indexed
 3140                                        ):
 3141                                            annotation_tool = "bcftools"
 3142                                        elif quick_annotation_format in [
 3143                                            "vcf",
 3144                                            "bed",
 3145                                            "tsv",
 3146                                            "tsv",
 3147                                            "csv",
 3148                                            "json",
 3149                                            "tbl",
 3150                                            "parquet",
 3151                                            "duckdb",
 3152                                        ]:
 3153                                            annotation_tool = "parquet"
 3154                                        else:
 3155                                            log.error(
 3156                                                f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet"
 3157                                            )
 3158                                            raise ValueError(
 3159                                                f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet"
 3160                                            )
 3161
 3162                                    log.debug(
 3163                                        f"Quick Annotation File {annotation_file} - Annotation tool: {annotation_tool}"
 3164                                    )
 3165
 3166                                    # Annotation Tool dispatch
 3167                                    if annotation_tool:
 3168                                        if annotation_tool not in param["annotation"]:
 3169                                            param["annotation"][annotation_tool] = {}
 3170                                        if (
 3171                                            "annotations"
 3172                                            not in param["annotation"][annotation_tool]
 3173                                        ):
 3174                                            param["annotation"][annotation_tool][
 3175                                                "annotations"
 3176                                            ] = {}
 3177                                        param["annotation"][annotation_tool][
 3178                                            "annotations"
 3179                                        ][annotation_file_found] = annotations
 3180
 3181                                else:
 3182                                    log.error(
 3183                                        f"Quick Annotation File {annotation_file} does NOT exist"
 3184                                    )
 3185
 3186                self.set_param(param)
 3187
 3188        if param.get("annotation", None):
 3189            log.info("Annotations")
 3190            if param.get("annotation", {}).get("parquet", None):
 3191                log.info("Annotations 'parquet'...")
 3192                self.annotation_parquet()
 3193            if param.get("annotation", {}).get("bcftools", None):
 3194                log.info("Annotations 'bcftools'...")
 3195                self.annotation_bcftools()
 3196            if param.get("annotation", {}).get("snpsift", None):
 3197                log.info("Annotations 'snpsift'...")
 3198                self.annotation_snpsift()
 3199            if param.get("annotation", {}).get("annovar", None):
 3200                log.info("Annotations 'annovar'...")
 3201                self.annotation_annovar()
 3202            if param.get("annotation", {}).get("snpeff", None):
 3203                log.info("Annotations 'snpeff'...")
 3204                self.annotation_snpeff()
 3205            if param.get("annotation", {}).get("exomiser", None) is not None:
 3206                log.info("Annotations 'exomiser'...")
 3207                self.annotation_exomiser()
 3208            if param.get("annotation", {}).get("splice", None) is not None:
 3209                log.info("Annotations 'splice' ...")
 3210                self.annotation_splice()
 3211
 3212        # Explode INFOS fields into table fields
 3213        if self.get_explode_infos():
 3214            self.explode_infos(
 3215                prefix=self.get_explode_infos_prefix(),
 3216                fields=self.get_explode_infos_fields(),
 3217                force=True,
 3218            )
 3219
 3220    def annotation_snpsift(self, threads: int = None) -> None:
 3221        """
 3222        This function annotate with bcftools
 3223
 3224        :param threads: Number of threads to use
 3225        :return: the value of the variable "return_value".
 3226        """
 3227
 3228        # DEBUG
 3229        log.debug("Start annotation with bcftools databases")
 3230
 3231        # Threads
 3232        if not threads:
 3233            threads = self.get_threads()
 3234        log.debug("Threads: " + str(threads))
 3235
 3236        # Config
 3237        config = self.get_config()
 3238        log.debug("Config: " + str(config))
 3239
 3240        # Config - snpSift
 3241        snpsift_bin_command = get_bin_command(
 3242            bin="SnpSift.jar",
 3243            tool="snpsift",
 3244            bin_type="jar",
 3245            config=config,
 3246            default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
 3247        )
 3248        if not snpsift_bin_command:
 3249            msg_err = f"Annotation failed: no snpsift bin '{snpsift_bin_command}'"
 3250            log.error(msg_err)
 3251            raise ValueError(msg_err)
 3252
 3253        # Config - bcftools
 3254        bcftools_bin_command = get_bin_command(
 3255            bin="bcftools",
 3256            tool="bcftools",
 3257            bin_type="bin",
 3258            config=config,
 3259            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
 3260        )
 3261        if not bcftools_bin_command:
 3262            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
 3263            log.error(msg_err)
 3264            raise ValueError(msg_err)
 3265
 3266        # Config - BCFTools databases folders
 3267        databases_folders = set(
 3268            self.get_config()
 3269            .get("folders", {})
 3270            .get("databases", {})
 3271            .get("annotations", ["."])
 3272            + self.get_config()
 3273            .get("folders", {})
 3274            .get("databases", {})
 3275            .get("bcftools", ["."])
 3276        )
 3277        log.debug("Databases annotations: " + str(databases_folders))
 3278
 3279        # Param
 3280        annotations = (
 3281            self.get_param()
 3282            .get("annotation", {})
 3283            .get("snpsift", {})
 3284            .get("annotations", None)
 3285        )
 3286        log.debug("Annotations: " + str(annotations))
 3287
 3288        # Assembly
 3289        assembly = self.get_param().get(
 3290            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
 3291        )
 3292
 3293        # Data
 3294        table_variants = self.get_table_variants()
 3295
 3296        # Check if not empty
 3297        log.debug("Check if not empty")
 3298        sql_query_chromosomes = (
 3299            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 3300        )
 3301        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
 3302        if not sql_query_chromosomes_df["count"][0]:
 3303            log.info(f"VCF empty")
 3304            return
 3305
 3306        # VCF header
 3307        vcf_reader = self.get_header()
 3308        log.debug("Initial header: " + str(vcf_reader.infos))
 3309
 3310        # Existing annotations
 3311        for vcf_annotation in self.get_header().infos:
 3312
 3313            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 3314            log.debug(
 3315                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 3316            )
 3317
 3318        if annotations:
 3319
 3320            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:
 3321
 3322                # Export VCF file
 3323                tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz")
 3324
 3325                # Init
 3326                commands = {}
 3327
 3328                for annotation in annotations:
 3329                    annotation_fields = annotations[annotation]
 3330
 3331                    # Annotation Name
 3332                    annotation_name = os.path.basename(annotation)
 3333
 3334                    if not annotation_fields:
 3335                        annotation_fields = {"INFO": None}
 3336
 3337                    log.debug(f"Annotation '{annotation_name}'")
 3338                    log.debug(
 3339                        f"Annotation '{annotation_name}' - fields: {annotation_fields}"
 3340                    )
 3341
 3342                    # Create Database
 3343                    database = Database(
 3344                        database=annotation,
 3345                        databases_folders=databases_folders,
 3346                        assembly=assembly,
 3347                    )
 3348
 3349                    # Find files
 3350                    db_file = database.get_database()
 3351                    db_file = full_path(db_file)
 3352                    db_hdr_file = database.get_header_file()
 3353                    db_hdr_file = full_path(db_hdr_file)
 3354                    db_file_type = database.get_format()
 3355                    db_tbi_file = f"{db_file}.tbi"
 3356                    db_file_compressed = database.is_compressed()
 3357
 3358                    # Check if compressed
 3359                    if not db_file_compressed:
 3360                        log.error(
 3361                            f"Annotation '{annotation}' - {db_file} NOT compressed file"
 3362                        )
 3363                        raise ValueError(
 3364                            f"Annotation '{annotation}' - {db_file} NOT compressed file"
 3365                        )
 3366
 3367                    # Check if indexed
 3368                    if not os.path.exists(db_tbi_file):
 3369                        log.error(
 3370                            f"Annotation '{annotation}' - {db_file} NOT indexed file"
 3371                        )
 3372                        raise ValueError(
 3373                            f"Annotation '{annotation}' - {db_file} NOT indexed file"
 3374                        )
 3375
 3376                    # Check index - try to create if not exists
 3377                    if not os.path.exists(db_file) or not os.path.exists(db_hdr_file):
 3378                        log.error("Annotation failed: database not valid")
 3379                        log.error(f"Annotation annotation file: {db_file}")
 3380                        log.error(f"Annotation annotation header: {db_hdr_file}")
 3381                        log.error(f"Annotation annotation index: {db_tbi_file}")
 3382                        raise ValueError(
 3383                            f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}"
 3384                        )
 3385                    else:
 3386
 3387                        log.debug(
 3388                            f"Annotation '{annotation}' - file: "
 3389                            + str(db_file)
 3390                            + " and "
 3391                            + str(db_hdr_file)
 3392                        )
 3393
 3394                        # Load header as VCF object
 3395                        db_hdr_vcf = Variants(input=db_hdr_file)
 3396                        db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
 3397                        log.debug(
 3398                            "Annotation database header: "
 3399                            + str(db_hdr_vcf_header_infos)
 3400                        )
 3401
 3402                        # For all fields in database
 3403                        annotation_fields_full = False
 3404                        if "ALL" in annotation_fields or "INFO" in annotation_fields:
 3405                            annotation_fields = {
 3406                                key: key for key in db_hdr_vcf_header_infos
 3407                            }
 3408                            log.debug(
 3409                                "Annotation database header - All annotations added: "
 3410                                + str(annotation_fields)
 3411                            )
 3412                            annotation_fields_full = True
 3413
 3414                        # # Create file for field rename
 3415                        # log.debug("Create file for field rename")
 3416                        # tmp_rename = NamedTemporaryFile(
 3417                        #     prefix=self.get_prefix(),
 3418                        #     dir=self.get_tmp_dir(),
 3419                        #     suffix=".rename",
 3420                        #     delete=False,
 3421                        # )
 3422                        # tmp_rename_name = tmp_rename.name
 3423                        # tmp_files.append(tmp_rename_name)
 3424
 3425                        # Number of fields
 3426                        nb_annotation_field = 0
 3427                        annotation_list = []
 3428                        annotation_infos_rename_list = []
 3429
 3430                        for annotation_field in annotation_fields:
 3431
 3432                            # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
 3433                            annotation_fields_new_name = annotation_fields.get(
 3434                                annotation_field, annotation_field
 3435                            )
 3436                            if not annotation_fields_new_name:
 3437                                annotation_fields_new_name = annotation_field
 3438
 3439                            # Check if field is in DB and if field is not elready in input data
 3440                            if (
 3441                                annotation_field in db_hdr_vcf.get_header().infos
 3442                                and annotation_fields_new_name
 3443                                not in self.get_header().infos
 3444                            ):
 3445
 3446                                log.info(
 3447                                    f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'"
 3448                                )
 3449
 3450                                # BCFTools annotate param to rename fields
 3451                                if annotation_field != annotation_fields_new_name:
 3452                                    annotation_infos_rename_list.append(
 3453                                        f"{annotation_fields_new_name}:=INFO/{annotation_field}"
 3454                                    )
 3455
 3456                                # Add INFO field to header
 3457                                db_hdr_vcf_header_infos_number = (
 3458                                    db_hdr_vcf_header_infos[annotation_field].num or "."
 3459                                )
 3460                                db_hdr_vcf_header_infos_type = (
 3461                                    db_hdr_vcf_header_infos[annotation_field].type
 3462                                    or "String"
 3463                                )
 3464                                db_hdr_vcf_header_infos_description = (
 3465                                    db_hdr_vcf_header_infos[annotation_field].desc
 3466                                    or f"{annotation_field} description"
 3467                                )
 3468                                db_hdr_vcf_header_infos_source = (
 3469                                    db_hdr_vcf_header_infos[annotation_field].source
 3470                                    or "unknown"
 3471                                )
 3472                                db_hdr_vcf_header_infos_version = (
 3473                                    db_hdr_vcf_header_infos[annotation_field].version
 3474                                    or "unknown"
 3475                                )
 3476
 3477                                vcf_reader.infos[annotation_fields_new_name] = (
 3478                                    vcf.parser._Info(
 3479                                        annotation_fields_new_name,
 3480                                        db_hdr_vcf_header_infos_number,
 3481                                        db_hdr_vcf_header_infos_type,
 3482                                        db_hdr_vcf_header_infos_description,
 3483                                        db_hdr_vcf_header_infos_source,
 3484                                        db_hdr_vcf_header_infos_version,
 3485                                        self.code_type_map[
 3486                                            db_hdr_vcf_header_infos_type
 3487                                        ],
 3488                                    )
 3489                                )
 3490
 3491                                annotation_list.append(annotation_field)
 3492
 3493                                nb_annotation_field += 1
 3494
 3495                            else:
 3496
 3497                                if (
 3498                                    annotation_field
 3499                                    not in db_hdr_vcf.get_header().infos
 3500                                ):
 3501                                    log.warning(
 3502                                        f"Annotation '{annotation_name}' - '{annotation_field}' - not available in vcf/bed file"
 3503                                    )
 3504                                if (
 3505                                    annotation_fields_new_name
 3506                                    in self.get_header().infos
 3507                                ):
 3508                                    log.warning(
 3509                                        f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' - already exists (skipped)"
 3510                                    )
 3511
 3512                        log.info(
 3513                            f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file"
 3514                        )
 3515
 3516                        annotation_infos = ",".join(annotation_list)
 3517
 3518                        if annotation_infos != "":
 3519
 3520                            # Annotated VCF (and error file)
 3521                            tmp_annotation_vcf_name = os.path.join(
 3522                                tmp_dir, os.path.basename(annotation) + ".vcf.gz"
 3523                            )
 3524                            tmp_annotation_vcf_name_err = (
 3525                                tmp_annotation_vcf_name + ".err"
 3526                            )
 3527
 3528                            # Add fields to annotate
 3529                            if not annotation_fields_full:
 3530                                annotation_infos_option = f"-info {annotation_infos}"
 3531                            else:
 3532                                annotation_infos_option = ""
 3533
 3534                            # Info fields rename
 3535                            if annotation_infos_rename_list:
 3536                                annotation_infos_rename = " -c " + ",".join(
 3537                                    annotation_infos_rename_list
 3538                                )
 3539                            else:
 3540                                annotation_infos_rename = ""
 3541
 3542                            # Annotate command
 3543                            command_annotate = f"{snpsift_bin_command} annotate {annotation_infos_option} {db_file} {tmp_vcf_name} | {bcftools_bin_command} annotate --threads={threads} {annotation_infos_rename} -Oz1 -o {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} "
 3544
 3545                            # Add command
 3546                            commands[command_annotate] = tmp_annotation_vcf_name
 3547
 3548                if commands:
 3549
 3550                    # Export VCF file
 3551                    self.export_variant_vcf(
 3552                        vcf_file=tmp_vcf_name,
 3553                        remove_info=True,
 3554                        add_samples=False,
 3555                        index=True,
 3556                    )
 3557                    shutil.copyfile(tmp_vcf_name, "/tmp/input.vcf")
 3558
 3559                    # Num command
 3560                    nb_command = 0
 3561
 3562                    # Annotate
 3563                    for command_annotate in commands:
 3564                        nb_command += 1
 3565                        log.info(
 3566                            f"Annotation - Annotate [{nb_command}/{len(commands)}]..."
 3567                        )
 3568                        log.debug(f"command_annotate={command_annotate}")
 3569                        run_parallel_commands([command_annotate], threads)
 3570
 3571                        # Debug
 3572                        shutil.copyfile(commands[command_annotate], "/tmp/snpsift.vcf")
 3573
 3574                        # Update variants
 3575                        log.info(
 3576                            f"Annotation - Updating [{nb_command}/{len(commands)}]..."
 3577                        )
 3578                        self.update_from_vcf(commands[command_annotate])
 3579
 3580    def annotation_bcftools(self, threads: int = None) -> None:
 3581        """
 3582        This function annotate with bcftools
 3583
 3584        :param threads: Number of threads to use
 3585        :return: the value of the variable "return_value".
 3586        """
 3587
 3588        # DEBUG
 3589        log.debug("Start annotation with bcftools databases")
 3590
 3591        # Threads
 3592        if not threads:
 3593            threads = self.get_threads()
 3594        log.debug("Threads: " + str(threads))
 3595
 3596        # Config
 3597        config = self.get_config()
 3598        log.debug("Config: " + str(config))
 3599
 3600        # DEBUG
 3601        delete_tmp = True
 3602        if self.get_config().get("verbosity", "warning") in ["debug"]:
 3603            delete_tmp = False
 3604            log.debug("Delete tmp files/folders: " + str(delete_tmp))
 3605
 3606        # Config - BCFTools bin command
 3607        bcftools_bin_command = get_bin_command(
 3608            bin="bcftools",
 3609            tool="bcftools",
 3610            bin_type="bin",
 3611            config=config,
 3612            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
 3613        )
 3614        if not bcftools_bin_command:
 3615            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
 3616            log.error(msg_err)
 3617            raise ValueError(msg_err)
 3618
 3619        # Config - BCFTools databases folders
 3620        databases_folders = set(
 3621            self.get_config()
 3622            .get("folders", {})
 3623            .get("databases", {})
 3624            .get("annotations", ["."])
 3625            + self.get_config()
 3626            .get("folders", {})
 3627            .get("databases", {})
 3628            .get("bcftools", ["."])
 3629        )
 3630        log.debug("Databases annotations: " + str(databases_folders))
 3631
 3632        # Param
 3633        annotations = (
 3634            self.get_param()
 3635            .get("annotation", {})
 3636            .get("bcftools", {})
 3637            .get("annotations", None)
 3638        )
 3639        log.debug("Annotations: " + str(annotations))
 3640
 3641        # Assembly
 3642        assembly = self.get_param().get(
 3643            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
 3644        )
 3645
 3646        # Data
 3647        table_variants = self.get_table_variants()
 3648
 3649        # Check if not empty
 3650        log.debug("Check if not empty")
 3651        sql_query_chromosomes = (
 3652            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 3653        )
 3654        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
 3655        if not sql_query_chromosomes_df["count"][0]:
 3656            log.info(f"VCF empty")
 3657            return
 3658
 3659        # Export in VCF
 3660        log.debug("Create initial file to annotate")
 3661        tmp_vcf = NamedTemporaryFile(
 3662            prefix=self.get_prefix(),
 3663            dir=self.get_tmp_dir(),
 3664            suffix=".vcf.gz",
 3665            delete=False,
 3666        )
 3667        tmp_vcf_name = tmp_vcf.name
 3668
 3669        # VCF header
 3670        vcf_reader = self.get_header()
 3671        log.debug("Initial header: " + str(vcf_reader.infos))
 3672
 3673        # Existing annotations
 3674        for vcf_annotation in self.get_header().infos:
 3675
 3676            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 3677            log.debug(
 3678                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 3679            )
 3680
 3681        if annotations:
 3682
 3683            tmp_ann_vcf_list = []
 3684            commands = []
 3685            tmp_files = []
 3686            err_files = []
 3687
 3688            for annotation in annotations:
 3689                annotation_fields = annotations[annotation]
 3690
 3691                # Annotation Name
 3692                annotation_name = os.path.basename(annotation)
 3693
 3694                if not annotation_fields:
 3695                    annotation_fields = {"INFO": None}
 3696
 3697                log.debug(f"Annotation '{annotation_name}'")
 3698                log.debug(
 3699                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
 3700                )
 3701
 3702                # Create Database
 3703                database = Database(
 3704                    database=annotation,
 3705                    databases_folders=databases_folders,
 3706                    assembly=assembly,
 3707                )
 3708
 3709                # Find files
 3710                db_file = database.get_database()
 3711                db_file = full_path(db_file)
 3712                db_hdr_file = database.get_header_file()
 3713                db_hdr_file = full_path(db_hdr_file)
 3714                db_file_type = database.get_format()
 3715                db_tbi_file = f"{db_file}.tbi"
 3716                db_file_compressed = database.is_compressed()
 3717
 3718                # Check if compressed
 3719                if not db_file_compressed:
 3720                    log.error(
 3721                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
 3722                    )
 3723                    raise ValueError(
 3724                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
 3725                    )
 3726
 3727                # Check if indexed
 3728                if not os.path.exists(db_tbi_file):
 3729                    log.error(f"Annotation '{annotation}' - {db_file} NOT indexed file")
 3730                    raise ValueError(
 3731                        f"Annotation '{annotation}' - {db_file} NOT indexed file"
 3732                    )
 3733
 3734                # Check index - try to create if not exists
 3735                if not os.path.exists(db_file) or not os.path.exists(db_hdr_file):
 3736                    log.error("Annotation failed: database not valid")
 3737                    log.error(f"Annotation annotation file: {db_file}")
 3738                    log.error(f"Annotation annotation header: {db_hdr_file}")
 3739                    log.error(f"Annotation annotation index: {db_tbi_file}")
 3740                    raise ValueError(
 3741                        f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}"
 3742                    )
 3743                else:
 3744
 3745                    log.debug(
 3746                        f"Annotation '{annotation}' - file: "
 3747                        + str(db_file)
 3748                        + " and "
 3749                        + str(db_hdr_file)
 3750                    )
 3751
 3752                    # Load header as VCF object
 3753                    db_hdr_vcf = Variants(input=db_hdr_file)
 3754                    db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
 3755                    log.debug(
 3756                        "Annotation database header: " + str(db_hdr_vcf_header_infos)
 3757                    )
 3758
 3759                    # For all fields in database
 3760                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
 3761                        annotation_fields = {
 3762                            key: key for key in db_hdr_vcf_header_infos
 3763                        }
 3764                        log.debug(
 3765                            "Annotation database header - All annotations added: "
 3766                            + str(annotation_fields)
 3767                        )
 3768
 3769                    # Number of fields
 3770                    nb_annotation_field = 0
 3771                    annotation_list = []
 3772
 3773                    for annotation_field in annotation_fields:
 3774
 3775                        # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
 3776                        annotation_fields_new_name = annotation_fields.get(
 3777                            annotation_field, annotation_field
 3778                        )
 3779                        if not annotation_fields_new_name:
 3780                            annotation_fields_new_name = annotation_field
 3781
 3782                        # Check if field is in DB and if field is not elready in input data
 3783                        if (
 3784                            annotation_field in db_hdr_vcf.get_header().infos
 3785                            and annotation_fields_new_name
 3786                            not in self.get_header().infos
 3787                        ):
 3788
 3789                            log.info(
 3790                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'"
 3791                            )
 3792
 3793                            # Add INFO field to header
 3794                            db_hdr_vcf_header_infos_number = (
 3795                                db_hdr_vcf_header_infos[annotation_field].num or "."
 3796                            )
 3797                            db_hdr_vcf_header_infos_type = (
 3798                                db_hdr_vcf_header_infos[annotation_field].type
 3799                                or "String"
 3800                            )
 3801                            db_hdr_vcf_header_infos_description = (
 3802                                db_hdr_vcf_header_infos[annotation_field].desc
 3803                                or f"{annotation_field} description"
 3804                            )
 3805                            db_hdr_vcf_header_infos_source = (
 3806                                db_hdr_vcf_header_infos[annotation_field].source
 3807                                or "unknown"
 3808                            )
 3809                            db_hdr_vcf_header_infos_version = (
 3810                                db_hdr_vcf_header_infos[annotation_field].version
 3811                                or "unknown"
 3812                            )
 3813
 3814                            vcf_reader.infos[annotation_fields_new_name] = (
 3815                                vcf.parser._Info(
 3816                                    annotation_fields_new_name,
 3817                                    db_hdr_vcf_header_infos_number,
 3818                                    db_hdr_vcf_header_infos_type,
 3819                                    db_hdr_vcf_header_infos_description,
 3820                                    db_hdr_vcf_header_infos_source,
 3821                                    db_hdr_vcf_header_infos_version,
 3822                                    self.code_type_map[db_hdr_vcf_header_infos_type],
 3823                                )
 3824                            )
 3825
 3826                            # annotation_list.append(annotation_field)
 3827                            if annotation_field != annotation_fields_new_name:
 3828                                annotation_list.append(
 3829                                    f"{annotation_fields_new_name}:=INFO/{annotation_field}"
 3830                                )
 3831                            else:
 3832                                annotation_list.append(annotation_field)
 3833
 3834                            nb_annotation_field += 1
 3835
 3836                        else:
 3837
 3838                            if annotation_field not in db_hdr_vcf.get_header().infos:
 3839                                log.warning(
 3840                                    f"Annotation '{annotation}' - '{annotation_field}' - not available in vcf/bed file"
 3841                                )
 3842                            if annotation_fields_new_name in self.get_header().infos:
 3843                                log.warning(
 3844                                    f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
 3845                                )
 3846
 3847                    log.info(
 3848                        f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file"
 3849                    )
 3850
 3851                    annotation_infos = ",".join(annotation_list)
 3852
 3853                    if annotation_infos != "":
 3854
 3855                        # Protect header for bcftools (remove "#CHROM" and variants line)
 3856                        log.debug("Protect Header file - remove #CHROM line if exists")
 3857                        tmp_header_vcf = NamedTemporaryFile(
 3858                            prefix=self.get_prefix(),
 3859                            dir=self.get_tmp_dir(),
 3860                            suffix=".hdr",
 3861                            delete=False,
 3862                        )
 3863                        tmp_header_vcf_name = tmp_header_vcf.name
 3864                        tmp_files.append(tmp_header_vcf_name)
 3865                        # Command
 3866                        if db_hdr_file.endswith(".gz"):
 3867                            command_extract_header = f"zcat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
 3868                        else:
 3869                            command_extract_header = f"cat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
 3870                        # Run
 3871                        run_parallel_commands([command_extract_header], 1)
 3872
 3873                        # Find chomosomes
 3874                        log.debug("Find chromosomes ")
 3875                        sql_query_chromosomes = f"""SELECT table_variants.\"#CHROM\" as CHROM FROM {table_variants} as table_variants GROUP BY table_variants.\"#CHROM\""""
 3876                        sql_query_chromosomes_df = self.get_query_to_df(
 3877                            sql_query_chromosomes
 3878                        )
 3879                        chomosomes_list = list(sql_query_chromosomes_df["CHROM"])
 3880
 3881                        log.debug("Chromosomes found: " + str(list(chomosomes_list)))
 3882
 3883                        # BED columns in the annotation file
 3884                        if db_file_type in ["bed"]:
 3885                            annotation_infos = "CHROM,POS,POS," + annotation_infos
 3886
 3887                        for chrom in chomosomes_list:
 3888
 3889                            # Create BED on initial VCF
 3890                            log.debug("Create BED on initial VCF: " + str(tmp_vcf_name))
 3891                            tmp_bed = NamedTemporaryFile(
 3892                                prefix=self.get_prefix(),
 3893                                dir=self.get_tmp_dir(),
 3894                                suffix=".bed",
 3895                                delete=False,
 3896                            )
 3897                            tmp_bed_name = tmp_bed.name
 3898                            tmp_files.append(tmp_bed_name)
 3899
 3900                            # Detecte regions
 3901                            log.debug(
 3902                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Start detecting regions..."
 3903                            )
 3904                            window = 1000000
 3905                            sql_query_intervals_for_bed = f"""
 3906                                SELECT  \"#CHROM\",
 3907                                        CASE WHEN \"POS\"-{window}-1 < 0 THEN 0 ELSE \"POS\"-{window}-1 END,
 3908                                        \"POS\"+{window}
 3909                                FROM {table_variants} as table_variants
 3910                                WHERE table_variants.\"#CHROM\" = '{chrom}'
 3911                            """
 3912                            regions = self.conn.execute(
 3913                                sql_query_intervals_for_bed
 3914                            ).fetchall()
 3915                            merged_regions = merge_regions(regions)
 3916                            log.debug(
 3917                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Stop detecting regions..."
 3918                            )
 3919
 3920                            header = ["#CHROM", "START", "END"]
 3921                            with open(tmp_bed_name, "w") as f:
 3922                                # Write the header with tab delimiter
 3923                                f.write("\t".join(header) + "\n")
 3924                                for d in merged_regions:
 3925                                    # Write each data row with tab delimiter
 3926                                    f.write("\t".join(map(str, d)) + "\n")
 3927
 3928                            # Tmp files
 3929                            tmp_annotation_vcf = NamedTemporaryFile(
 3930                                prefix=self.get_prefix(),
 3931                                dir=self.get_tmp_dir(),
 3932                                suffix=".vcf.gz",
 3933                                delete=False,
 3934                            )
 3935                            tmp_annotation_vcf_name = tmp_annotation_vcf.name
 3936                            tmp_files.append(tmp_annotation_vcf_name)
 3937                            tmp_ann_vcf_list.append(f"{tmp_annotation_vcf_name}")
 3938                            tmp_annotation_vcf_name_err = (
 3939                                tmp_annotation_vcf_name + ".err"
 3940                            )
 3941                            err_files.append(tmp_annotation_vcf_name_err)
 3942
 3943                            # Annotate Command
 3944                            log.debug(
 3945                                f"Annotation '{annotation}' - add bcftools command"
 3946                            )
 3947
 3948                            # Command
 3949                            command_annotate = f"{bcftools_bin_command} annotate --pair-logic exact --regions-file={tmp_bed_name} -a {db_file} -h {tmp_header_vcf_name} -c {annotation_infos} {tmp_vcf_name} -o {tmp_annotation_vcf_name} -Oz1 2>>{tmp_annotation_vcf_name_err} && tabix {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} "
 3950
 3951                            # Add command
 3952                            commands.append(command_annotate)
 3953
 3954            # if some commands
 3955            if commands:
 3956
 3957                # Export VCF file
 3958                self.export_variant_vcf(
 3959                    vcf_file=tmp_vcf_name,
 3960                    remove_info=True,
 3961                    add_samples=False,
 3962                    index=True,
 3963                )
 3964
 3965                # Threads
 3966                # calculate threads for annotated commands
 3967                if commands:
 3968                    threads_bcftools_annotate = round(threads / len(commands))
 3969                else:
 3970                    threads_bcftools_annotate = 1
 3971
 3972                if not threads_bcftools_annotate:
 3973                    threads_bcftools_annotate = 1
 3974
 3975                # Add threads option to bcftools commands
 3976                if threads_bcftools_annotate > 1:
 3977                    commands_threaded = []
 3978                    for command in commands:
 3979                        commands_threaded.append(
 3980                            command.replace(
 3981                                f"{bcftools_bin_command} annotate ",
 3982                                f"{bcftools_bin_command} annotate --threads={threads_bcftools_annotate} ",
 3983                            )
 3984                        )
 3985                    commands = commands_threaded
 3986
 3987                # Command annotation multithreading
 3988                log.debug(f"Annotation - Annotation commands: " + str(commands))
 3989                log.info(
 3990                    f"Annotation - Annotation multithreaded in "
 3991                    + str(len(commands))
 3992                    + " commands"
 3993                )
 3994
 3995                run_parallel_commands(commands, threads)
 3996
 3997                # Merge
 3998                tmp_ann_vcf_list_cmd = " ".join(tmp_ann_vcf_list)
 3999
 4000                if tmp_ann_vcf_list_cmd:
 4001
 4002                    # Tmp file
 4003                    tmp_annotate_vcf = NamedTemporaryFile(
 4004                        prefix=self.get_prefix(),
 4005                        dir=self.get_tmp_dir(),
 4006                        suffix=".vcf.gz",
 4007                        delete=True,
 4008                    )
 4009                    tmp_annotate_vcf_name = tmp_annotate_vcf.name
 4010                    tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
 4011                    err_files.append(tmp_annotate_vcf_name_err)
 4012
 4013                    # Tmp file remove command
 4014                    tmp_files_remove_command = ""
 4015                    if tmp_files:
 4016                        tmp_files_remove_command = " && rm -f " + " ".join(tmp_files)
 4017
 4018                    # Command merge
 4019                    merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_ann_vcf_list_cmd} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} {tmp_files_remove_command}"
 4020                    log.info(
 4021                        f"Annotation - Annotation merging "
 4022                        + str(len(commands))
 4023                        + " annotated files"
 4024                    )
 4025                    log.debug(f"Annotation - merge command: {merge_command}")
 4026                    run_parallel_commands([merge_command], 1)
 4027
 4028                    # Error messages
 4029                    log.info(f"Error/Warning messages:")
 4030                    error_message_command_all = []
 4031                    error_message_command_warning = []
 4032                    error_message_command_err = []
 4033                    for err_file in err_files:
 4034                        with open(err_file, "r") as f:
 4035                            for line in f:
 4036                                message = line.strip()
 4037                                error_message_command_all.append(message)
 4038                                if line.startswith("[W::"):
 4039                                    error_message_command_warning.append(message)
 4040                                if line.startswith("[E::"):
 4041                                    error_message_command_err.append(
 4042                                        f"{err_file}: " + message
 4043                                    )
 4044                    # log info
 4045                    for message in list(
 4046                        set(error_message_command_err + error_message_command_warning)
 4047                    ):
 4048                        log.info(f"   {message}")
 4049                    # debug info
 4050                    for message in list(set(error_message_command_all)):
 4051                        log.debug(f"   {message}")
 4052                    # failed
 4053                    if len(error_message_command_err):
 4054                        log.error("Annotation failed: Error in commands")
 4055                        raise ValueError("Annotation failed: Error in commands")
 4056
 4057                    # Update variants
 4058                    log.info(f"Annotation - Updating...")
 4059                    self.update_from_vcf(tmp_annotate_vcf_name)
 4060
 4061    def annotation_exomiser(self, threads: int = None) -> None:
 4062        """
 4063        This function annotate with Exomiser
 4064
 4065        This function uses args as parameters, in section "annotation" -> "exomiser", with sections:
 4066        - "analysis" (dict/file):
 4067            Full analysis dictionnary parameters (see Exomiser docs).
 4068            Either a dict, or a file in JSON or YAML format.
 4069            These parameters may change depending on other parameters (e.g. phenotipicFeatures/HPO)
 4070            Default : None
 4071        - "preset" (string):
 4072            Analysis preset (available in config folder).
 4073            Used if no full "analysis" is provided.
 4074            Default: "exome"
 4075        - "phenopacket" (dict/file):
 4076            Samples and phenotipic features parameters (see Exomiser docs).
 4077            Either a dict, or a file in JSON or YAML format.
 4078            Default: None
 4079        - "subject" (dict):
 4080            Sample parameters (see Exomiser docs).
 4081            Example:
 4082                "subject":
 4083                    {
 4084                        "id": "ISDBM322017",
 4085                        "sex": "FEMALE"
 4086                    }
 4087            Default: None
 4088        - "sample" (string):
 4089            Sample name to construct "subject" section:
 4090                "subject":
 4091                    {
 4092                        "id": "<sample>",
 4093                        "sex": "UNKNOWN_SEX"
 4094                    }
 4095            Default: None
 4096        - "phenotypicFeatures" (dict)
 4097            Phenotypic features to construct "subject" section.
 4098            Example:
 4099                "phenotypicFeatures":
 4100                    [
 4101                        { "type": { "id": "HP:0001159", "label": "Syndactyly" } },
 4102                        { "type": { "id": "HP:0000486", "label": "Strabismus" } }
 4103                    ]
 4104        - "hpo" (list)
 4105            List of HPO ids as phenotypic features.
 4106            Example:
 4107                "hpo": ['0001156', '0001363', '0011304', '0010055']
 4108            Default: []
 4109        - "outputOptions" (dict):
 4110            Output options (see Exomiser docs).
 4111            Default:
 4112                "output_options" =
 4113                    {
 4114                        "outputContributingVariantsOnly": False,
 4115                        "numGenes": 0,
 4116                        "outputFormats": ["TSV_VARIANT", "VCF"]
 4117                    }
 4118        - "transcript_source" (string):
 4119            Transcript source (either "refseq", "ucsc", "ensembl")
 4120            Default: "refseq"
 4121        - "exomiser_to_info" (boolean):
 4122            Add exomiser TSV file columns as INFO fields in VCF.
 4123            Default: False
 4124        - "release" (string):
 4125            Exomise database release.
 4126            If not exists, database release will be downloaded (take a while).
 4127            Default: None (provided by application.properties configuration file)
 4128        - "exomiser_application_properties" (file):
 4129            Exomiser configuration file (see Exomiser docs).
 4130            Useful to automatically download databases (especially for specific genome databases).
 4131
 4132        Notes:
 4133        - If no sample in parameters, first sample in VCF will be chosen
 4134        - If no HPO found, "hiPhivePrioritiser" analysis step will be switch off
 4135
 4136        :param threads: The number of threads to use
 4137        :return: None.
 4138        """
 4139
 4140        # DEBUG
 4141        log.debug("Start annotation with Exomiser databases")
 4142
 4143        # Threads
 4144        if not threads:
 4145            threads = self.get_threads()
 4146        log.debug("Threads: " + str(threads))
 4147
 4148        # Config
 4149        config = self.get_config()
 4150        log.debug("Config: " + str(config))
 4151
 4152        # Config - Folders - Databases
 4153        databases_folders = (
 4154            config.get("folders", {})
 4155            .get("databases", {})
 4156            .get("exomiser", f"{DEFAULT_DATABASE_FOLDER}/exomiser/current")
 4157        )
 4158        databases_folders = full_path(databases_folders)
 4159        if not os.path.exists(databases_folders):
 4160            log.error(f"Databases annotations: {databases_folders} NOT found")
 4161        log.debug("Databases annotations: " + str(databases_folders))
 4162
 4163        # Config - Exomiser
 4164        exomiser_bin_command = get_bin_command(
 4165            bin="exomiser-cli*.jar",
 4166            tool="exomiser",
 4167            bin_type="jar",
 4168            config=config,
 4169            default_folder=f"{DEFAULT_TOOLS_FOLDER}/exomiser",
 4170        )
 4171        log.debug("Exomiser bin command: " + str(exomiser_bin_command))
 4172        if not exomiser_bin_command:
 4173            msg_err = f"Annotation failed: no exomiser bin '{exomiser_bin_command}'"
 4174            log.error(msg_err)
 4175            raise ValueError(msg_err)
 4176
 4177        # Param
 4178        param = self.get_param()
 4179        log.debug("Param: " + str(param))
 4180
 4181        # Param - Exomiser
 4182        param_exomiser = param.get("annotation", {}).get("exomiser", {})
 4183        log.debug(f"Param Exomiser: {param_exomiser}")
 4184
 4185        # Param - Assembly
 4186        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
 4187        log.debug("Assembly: " + str(assembly))
 4188
 4189        # Data
 4190        table_variants = self.get_table_variants()
 4191
 4192        # Check if not empty
 4193        log.debug("Check if not empty")
 4194        sql_query_chromosomes = (
 4195            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 4196        )
 4197        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
 4198            log.info(f"VCF empty")
 4199            return False
 4200
 4201        # VCF header
 4202        vcf_reader = self.get_header()
 4203        log.debug("Initial header: " + str(vcf_reader.infos))
 4204
 4205        # Samples
 4206        samples = self.get_header_sample_list()
 4207        if not samples:
 4208            log.error("No Samples in VCF")
 4209            return False
 4210        log.debug(f"Samples: {samples}")
 4211
 4212        # Memory limit
 4213        memory_limit = self.get_memory("8G")
 4214        log.debug(f"memory_limit: {memory_limit}")
 4215
 4216        # Exomiser java options
 4217        exomiser_java_options = (
 4218            f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} "
 4219        )
 4220        log.debug(f"Exomiser java options: {exomiser_java_options}")
 4221
 4222        # Download Exomiser (if not exists)
 4223        exomiser_release = param_exomiser.get("release", None)
 4224        exomiser_application_properties = param_exomiser.get(
 4225            "exomiser_application_properties", None
 4226        )
 4227        databases_download_exomiser(
 4228            assemblies=[assembly],
 4229            exomiser_folder=databases_folders,
 4230            exomiser_release=exomiser_release,
 4231            exomiser_phenotype_release=exomiser_release,
 4232            exomiser_application_properties=exomiser_application_properties,
 4233        )
 4234
 4235        # Force annotation
 4236        force_update_annotation = True
 4237
 4238        if "Exomiser" not in self.get_header().infos or force_update_annotation:
 4239            log.debug("Start annotation Exomiser")
 4240
 4241            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:
 4242
 4243                # tmp_dir = "/tmp/exomiser"
 4244
 4245                ### ANALYSIS ###
 4246                ################
 4247
 4248                # Create analysis.json through analysis dict
 4249                # either analysis in param or by default
 4250                # depending on preset exome/genome)
 4251
 4252                # Init analysis dict
 4253                param_exomiser_analysis_dict = {}
 4254
 4255                # analysis from param
 4256                param_exomiser_analysis = param_exomiser.get("analysis", {})
 4257                param_exomiser_analysis = full_path(param_exomiser_analysis)
 4258
 4259                # If analysis in param -> load anlaysis json
 4260                if param_exomiser_analysis:
 4261
 4262                    # If param analysis is a file and exists
 4263                    if isinstance(param_exomiser_analysis, str) and os.path.exists(
 4264                        param_exomiser_analysis
 4265                    ):
 4266                        # Load analysis file into analysis dict (either yaml or json)
 4267                        with open(param_exomiser_analysis) as json_file:
 4268                            param_exomiser_analysis_dict = yaml.safe_load(json_file)
 4269
 4270                    # If param analysis is a dict
 4271                    elif isinstance(param_exomiser_analysis, dict):
 4272                        # Load analysis dict into analysis dict (either yaml or json)
 4273                        param_exomiser_analysis_dict = param_exomiser_analysis
 4274
 4275                    # Error analysis type
 4276                    else:
 4277                        log.error(f"Analysis type unknown. Check param file.")
 4278                        raise ValueError(f"Analysis type unknown. Check param file.")
 4279
 4280                # Case no input analysis config file/dict
 4281                # Use preset (exome/genome) to open default config file
 4282                if not param_exomiser_analysis_dict:
 4283
 4284                    # default preset
 4285                    default_preset = "exome"
 4286
 4287                    # Get param preset or default preset
 4288                    param_exomiser_preset = param_exomiser.get("preset", default_preset)
 4289
 4290                    # Try to find if preset is a file
 4291                    if os.path.exists(param_exomiser_preset):
 4292                        # Preset file is provided in full path
 4293                        param_exomiser_analysis_default_config_file = (
 4294                            param_exomiser_preset
 4295                        )
 4296                    # elif os.path.exists(full_path(param_exomiser_preset)):
 4297                    #     # Preset file is provided in full path
 4298                    #     param_exomiser_analysis_default_config_file = full_path(param_exomiser_preset)
 4299                    elif os.path.exists(
 4300                        os.path.join(folder_config, param_exomiser_preset)
 4301                    ):
 4302                        # Preset file is provided a basename in config folder (can be a path with subfolders)
 4303                        param_exomiser_analysis_default_config_file = os.path.join(
 4304                            folder_config, param_exomiser_preset
 4305                        )
 4306                    else:
 4307                        # Construct preset file
 4308                        param_exomiser_analysis_default_config_file = os.path.join(
 4309                            folder_config,
 4310                            f"preset-{param_exomiser_preset}-analysis.json",
 4311                        )
 4312
 4313                    # If preset file exists
 4314                    param_exomiser_analysis_default_config_file = full_path(
 4315                        param_exomiser_analysis_default_config_file
 4316                    )
 4317                    if os.path.exists(param_exomiser_analysis_default_config_file):
 4318                        # Load prest file into analysis dict (either yaml or json)
 4319                        with open(
 4320                            param_exomiser_analysis_default_config_file
 4321                        ) as json_file:
 4322                            # param_exomiser_analysis_dict[""] = json.load(json_file)
 4323                            param_exomiser_analysis_dict["analysis"] = yaml.safe_load(
 4324                                json_file
 4325                            )
 4326
 4327                    # Error preset file
 4328                    else:
 4329                        log.error(
 4330                            f"No analysis preset config file ({param_exomiser_analysis_default_config_file})"
 4331                        )
 4332                        raise ValueError(
 4333                            f"No analysis preset config file ({param_exomiser_analysis_default_config_file})"
 4334                        )
 4335
 4336                # If no analysis dict created
 4337                if not param_exomiser_analysis_dict:
 4338                    log.error(f"No analysis config")
 4339                    raise ValueError(f"No analysis config")
 4340
 4341                # Log
 4342                log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}")
 4343
 4344                ### PHENOPACKET ###
 4345                ###################
 4346
 4347                # If no PhenoPacket in analysis dict -> check in param
 4348                if "phenopacket" not in param_exomiser_analysis_dict:
 4349
 4350                    # If PhenoPacket in param -> load anlaysis json
 4351                    if param_exomiser.get("phenopacket", None):
 4352
 4353                        param_exomiser_phenopacket = param_exomiser.get("phenopacket")
 4354                        param_exomiser_phenopacket = full_path(
 4355                            param_exomiser_phenopacket
 4356                        )
 4357
 4358                        # If param phenopacket is a file and exists
 4359                        if isinstance(
 4360                            param_exomiser_phenopacket, str
 4361                        ) and os.path.exists(param_exomiser_phenopacket):
 4362                            # Load phenopacket file into analysis dict (either yaml or json)
 4363                            with open(param_exomiser_phenopacket) as json_file:
 4364                                param_exomiser_analysis_dict["phenopacket"] = (
 4365                                    yaml.safe_load(json_file)
 4366                                )
 4367
 4368                        # If param phenopacket is a dict
 4369                        elif isinstance(param_exomiser_phenopacket, dict):
 4370                            # Load phenopacket dict into analysis dict (either yaml or json)
 4371                            param_exomiser_analysis_dict["phenopacket"] = (
 4372                                param_exomiser_phenopacket
 4373                            )
 4374
 4375                        # Error phenopacket type
 4376                        else:
 4377                            log.error(f"Phenopacket type unknown. Check param file.")
 4378                            raise ValueError(
 4379                                f"Phenopacket type unknown. Check param file."
 4380                            )
 4381
 4382                # If no PhenoPacket in analysis dict -> construct from sample and HPO in param
 4383                if "phenopacket" not in param_exomiser_analysis_dict:
 4384
 4385                    # Init PhenoPacket
 4386                    param_exomiser_analysis_dict["phenopacket"] = {
 4387                        "id": "analysis",
 4388                        "proband": {},
 4389                    }
 4390
 4391                    ### Add subject ###
 4392
 4393                    # If subject exists
 4394                    param_exomiser_subject = param_exomiser.get("subject", {})
 4395
 4396                    # If subject not exists -> found sample ID
 4397                    if not param_exomiser_subject:
 4398
 4399                        # Found sample ID in param
 4400                        sample = param_exomiser.get("sample", None)
 4401
 4402                        # Find sample ID (first sample)
 4403                        if not sample:
 4404                            sample_list = self.get_header_sample_list()
 4405                            if len(sample_list) > 0:
 4406                                sample = sample_list[0]
 4407                            else:
 4408                                log.error(f"No sample found")
 4409                                raise ValueError(f"No sample found")
 4410
 4411                        # Create subject
 4412                        param_exomiser_subject = {"id": sample, "sex": "UNKNOWN_SEX"}
 4413
 4414                    # Add to dict
 4415                    param_exomiser_analysis_dict["phenopacket"][
 4416                        "subject"
 4417                    ] = param_exomiser_subject
 4418
 4419                    ### Add "phenotypicFeatures" ###
 4420
 4421                    # If phenotypicFeatures exists
 4422                    param_exomiser_phenotypicfeatures = param_exomiser.get(
 4423                        "phenotypicFeatures", []
 4424                    )
 4425
 4426                    # If phenotypicFeatures not exists -> Try to infer from hpo list
 4427                    if not param_exomiser_phenotypicfeatures:
 4428
 4429                        # Found HPO in param
 4430                        param_exomiser_hpo = param_exomiser.get("hpo", [])
 4431
 4432                        # Split HPO if list in string format separated by comma
 4433                        if isinstance(param_exomiser_hpo, str):
 4434                            param_exomiser_hpo = param_exomiser_hpo.split(",")
 4435
 4436                        # Create HPO list
 4437                        for hpo in param_exomiser_hpo:
 4438                            hpo_clean = re.sub("[^0-9]", "", hpo)
 4439                            param_exomiser_phenotypicfeatures.append(
 4440                                {
 4441                                    "type": {
 4442                                        "id": f"HP:{hpo_clean}",
 4443                                        "label": f"HP:{hpo_clean}",
 4444                                    }
 4445                                }
 4446                            )
 4447
 4448                    # Add to dict
 4449                    param_exomiser_analysis_dict["phenopacket"][
 4450                        "phenotypicFeatures"
 4451                    ] = param_exomiser_phenotypicfeatures
 4452
 4453                    # If phenotypicFeatures not exists -> Remove hiPhivePrioritiser step
 4454                    if not param_exomiser_phenotypicfeatures:
 4455                        for step in param_exomiser_analysis_dict.get(
 4456                            "analysis", {}
 4457                        ).get("steps", []):
 4458                            if "hiPhivePrioritiser" in step:
 4459                                param_exomiser_analysis_dict.get("analysis", {}).get(
 4460                                    "steps", []
 4461                                ).remove(step)
 4462
 4463                ### Add Input File ###
 4464
 4465                # Initial file name and htsFiles
 4466                tmp_vcf_name = os.path.join(tmp_dir, "initial.vcf.gz")
 4467                param_exomiser_analysis_dict["phenopacket"]["htsFiles"] = [
 4468                    {
 4469                        "uri": tmp_vcf_name,
 4470                        "htsFormat": "VCF",
 4471                        "genomeAssembly": assembly,
 4472                    }
 4473                ]
 4474
 4475                ### Add metaData ###
 4476
 4477                # If metaData not in analysis dict
 4478                if "metaData" not in param_exomiser_analysis_dict:
 4479                    param_exomiser_analysis_dict["phenopacket"]["metaData"] = {
 4480                        "created": f"{datetime.datetime.now()}".replace(" ", "T") + "Z",
 4481                        "createdBy": "howard",
 4482                        "phenopacketSchemaVersion": 1,
 4483                    }
 4484
 4485                ### OutputOptions ###
 4486
 4487                # Init output result folder
 4488                output_results = os.path.join(tmp_dir, "results")
 4489
 4490                # If no outputOptions in analysis dict
 4491                if "outputOptions" not in param_exomiser_analysis_dict:
 4492
 4493                    # default output formats
 4494                    defaut_output_formats = ["TSV_VARIANT", "VCF"]
 4495
 4496                    # Get outputOptions in param
 4497                    output_options = param_exomiser.get("outputOptions", None)
 4498
 4499                    # If no output_options in param -> check
 4500                    if not output_options:
 4501                        output_options = {
 4502                            "outputContributingVariantsOnly": False,
 4503                            "numGenes": 0,
 4504                            "outputFormats": defaut_output_formats,
 4505                        }
 4506
 4507                    # Replace outputDirectory in output options
 4508                    output_options["outputDirectory"] = output_results
 4509                    output_options["outputFileName"] = "howard"
 4510
 4511                    # Add outputOptions in analysis dict
 4512                    param_exomiser_analysis_dict["outputOptions"] = output_options
 4513
 4514                else:
 4515
 4516                    # Replace output_results and output format (if exists in param)
 4517                    param_exomiser_analysis_dict["outputOptions"][
 4518                        "outputDirectory"
 4519                    ] = output_results
 4520                    param_exomiser_analysis_dict["outputOptions"]["outputFormats"] = (
 4521                        list(
 4522                            set(
 4523                                param_exomiser_analysis_dict.get(
 4524                                    "outputOptions", {}
 4525                                ).get("outputFormats", [])
 4526                                + ["TSV_VARIANT", "VCF"]
 4527                            )
 4528                        )
 4529                    )
 4530
 4531                # log
 4532                log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}")
 4533
 4534                ### ANALYSIS FILE ###
 4535                #####################
 4536
 4537                ### Full JSON analysis config file ###
 4538
 4539                exomiser_analysis = os.path.join(tmp_dir, "analysis.json")
 4540                with open(exomiser_analysis, "w") as fp:
 4541                    json.dump(param_exomiser_analysis_dict, fp, indent=4)
 4542
 4543                ### SPLIT analysis and sample config files
 4544
 4545                # Splitted analysis dict
 4546                param_exomiser_analysis_dict_for_split = (
 4547                    param_exomiser_analysis_dict.copy()
 4548                )
 4549
 4550                # Phenopacket JSON file
 4551                exomiser_analysis_phenopacket = os.path.join(
 4552                    tmp_dir, "analysis_phenopacket.json"
 4553                )
 4554                with open(exomiser_analysis_phenopacket, "w") as fp:
 4555                    json.dump(
 4556                        param_exomiser_analysis_dict_for_split.get("phenopacket"),
 4557                        fp,
 4558                        indent=4,
 4559                    )
 4560
 4561                # Analysis JSON file without Phenopacket parameters
 4562                param_exomiser_analysis_dict_for_split.pop("phenopacket")
 4563                exomiser_analysis_analysis = os.path.join(
 4564                    tmp_dir, "analysis_analysis.json"
 4565                )
 4566                with open(exomiser_analysis_analysis, "w") as fp:
 4567                    json.dump(param_exomiser_analysis_dict_for_split, fp, indent=4)
 4568
 4569                ### INITAL VCF file ###
 4570                #######################
 4571
 4572                ### Create list of samples to use and include inti initial VCF file ####
 4573
 4574                # Subject (main sample)
 4575                # Get sample ID in analysis dict
 4576                sample_subject = (
 4577                    param_exomiser_analysis_dict.get("phenopacket", {})
 4578                    .get("subject", {})
 4579                    .get("id", None)
 4580                )
 4581                sample_proband = (
 4582                    param_exomiser_analysis_dict.get("phenopacket", {})
 4583                    .get("proband", {})
 4584                    .get("subject", {})
 4585                    .get("id", None)
 4586                )
 4587                sample = []
 4588                if sample_subject:
 4589                    sample.append(sample_subject)
 4590                if sample_proband:
 4591                    sample.append(sample_proband)
 4592
 4593                # Get sample ID within Pedigree
 4594                pedigree_persons_list = (
 4595                    param_exomiser_analysis_dict.get("phenopacket", {})
 4596                    .get("pedigree", {})
 4597                    .get("persons", {})
 4598                )
 4599
 4600                # Create list with all sample ID in pedigree (if exists)
 4601                pedigree_persons = []
 4602                for person in pedigree_persons_list:
 4603                    pedigree_persons.append(person.get("individualId"))
 4604
 4605                # Concat subject sample ID and samples ID in pedigreesamples
 4606                samples = list(set(sample + pedigree_persons))
 4607
 4608                # Check if sample list is not empty
 4609                if not samples:
 4610                    log.error(f"No samples found")
 4611                    raise ValueError(f"No samples found")
 4612
 4613                # Create VCF with sample (either sample in param or first one by default)
 4614                # Export VCF file
 4615                self.export_variant_vcf(
 4616                    vcf_file=tmp_vcf_name,
 4617                    remove_info=True,
 4618                    add_samples=True,
 4619                    list_samples=samples,
 4620                    index=False,
 4621                )
 4622
 4623                ### Execute Exomiser ###
 4624                ########################
 4625
 4626                # Init command
 4627                exomiser_command = ""
 4628
 4629                # Command exomiser options
 4630                exomiser_options = f" --spring.config.location={databases_folders}/{assembly}/application.properties --exomiser.data-directory={databases_folders}/{assembly} "
 4631
 4632                # Release
 4633                exomiser_release = param_exomiser.get("release", None)
 4634                if exomiser_release:
 4635                    # phenotype data version
 4636                    exomiser_options += (
 4637                        f" --exomiser.phenotype.data-version={exomiser_release} "
 4638                    )
 4639                    # data version
 4640                    exomiser_options += (
 4641                        f" --exomiser.{assembly}.data-version={exomiser_release} "
 4642                    )
 4643                    # variant white list
 4644                    variant_white_list_file = (
 4645                        f"{exomiser_release}_{assembly}_clinvar_whitelist.tsv.gz"
 4646                    )
 4647                    if os.path.exists(
 4648                        os.path.join(
 4649                            databases_folders, assembly, variant_white_list_file
 4650                        )
 4651                    ):
 4652                        exomiser_options += f" --exomiser.{assembly}.variant-white-list-path={variant_white_list_file} "
 4653
 4654                # transcript_source
 4655                transcript_source = param_exomiser.get(
 4656                    "transcript_source", None
 4657                )  # ucsc, refseq, ensembl
 4658                if transcript_source:
 4659                    exomiser_options += (
 4660                        f" --exomiser.{assembly}.transcript-source={transcript_source} "
 4661                    )
 4662
 4663                # If analysis contain proband param
 4664                if param_exomiser_analysis_dict.get("phenopacket", {}).get(
 4665                    "proband", {}
 4666                ):
 4667                    exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis_analysis} --sample={exomiser_analysis_phenopacket} {exomiser_options} "
 4668
 4669                # If no proband (usually uniq sample)
 4670                else:
 4671                    exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis} {exomiser_options}"
 4672
 4673                # Log
 4674                log.debug(f"exomiser_command_analysis={exomiser_command_analysis}")
 4675
 4676                # Run command
 4677                result = subprocess.call(
 4678                    exomiser_command_analysis.split(), stdout=subprocess.PIPE
 4679                )
 4680                if result:
 4681                    log.error("Exomiser command failed")
 4682                    raise ValueError("Exomiser command failed")
 4683
 4684                ### RESULTS ###
 4685                ###############
 4686
 4687                ### Annotate with TSV fields ###
 4688
 4689                # Init result tsv file
 4690                exomiser_to_info = param_exomiser.get("exomiser_to_info", False)
 4691
 4692                # Init result tsv file
 4693                output_results_tsv = os.path.join(output_results, "howard.variants.tsv")
 4694
 4695                # Parse TSV file and explode columns in INFO field
 4696                if exomiser_to_info and os.path.exists(output_results_tsv):
 4697
 4698                    # Log
 4699                    log.debug("Exomiser columns to VCF INFO field")
 4700
 4701                    # Retrieve columns and types
 4702                    query = f""" SELECT * FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) LIMIT 0 """
 4703                    output_results_tsv_df = self.get_query_to_df(query)
 4704                    output_results_tsv_columns = output_results_tsv_df.columns.tolist()
 4705
 4706                    # Init concat fields for update
 4707                    sql_query_update_concat_fields = []
 4708
 4709                    # Fields to avoid
 4710                    fields_to_avoid = [
 4711                        "CONTIG",
 4712                        "START",
 4713                        "END",
 4714                        "REF",
 4715                        "ALT",
 4716                        "QUAL",
 4717                        "FILTER",
 4718                        "GENOTYPE",
 4719                    ]
 4720
 4721                    # List all columns to add into header
 4722                    for header_column in output_results_tsv_columns:
 4723
 4724                        # If header column is enable
 4725                        if header_column not in fields_to_avoid:
 4726
 4727                            # Header info type
 4728                            header_info_type = "String"
 4729                            header_column_df = output_results_tsv_df[header_column]
 4730                            header_column_df_dtype = header_column_df.dtype
 4731                            if header_column_df_dtype == object:
 4732                                if (
 4733                                    pd.to_numeric(header_column_df, errors="coerce")
 4734                                    .notnull()
 4735                                    .all()
 4736                                ):
 4737                                    header_info_type = "Float"
 4738                            else:
 4739                                header_info_type = "Integer"
 4740
 4741                            # Header info
 4742                            characters_to_validate = ["-"]
 4743                            pattern = "[" + "".join(characters_to_validate) + "]"
 4744                            header_info_name = re.sub(
 4745                                pattern,
 4746                                "_",
 4747                                f"Exomiser_{header_column}".replace("#", ""),
 4748                            )
 4749                            header_info_number = "."
 4750                            header_info_description = (
 4751                                f"Exomiser {header_column} annotation"
 4752                            )
 4753                            header_info_source = "Exomiser"
 4754                            header_info_version = "unknown"
 4755                            header_info_code = CODE_TYPE_MAP[header_info_type]
 4756                            vcf_reader.infos[header_info_name] = vcf.parser._Info(
 4757                                header_info_name,
 4758                                header_info_number,
 4759                                header_info_type,
 4760                                header_info_description,
 4761                                header_info_source,
 4762                                header_info_version,
 4763                                header_info_code,
 4764                            )
 4765
 4766                            # Add field to add for update to concat fields
 4767                            sql_query_update_concat_fields.append(
 4768                                f"""
 4769                                CASE
 4770                                    WHEN table_parquet."{header_column}" NOT IN ('','.')
 4771                                    THEN concat(
 4772                                        '{header_info_name}=',
 4773                                        table_parquet."{header_column}",
 4774                                        ';'
 4775                                        )
 4776
 4777                                    ELSE ''
 4778                                END
 4779                            """
 4780                            )
 4781
 4782                    # Update query
 4783                    sql_query_update = f"""
 4784                        UPDATE {table_variants} as table_variants
 4785                            SET INFO = concat(
 4786                                            CASE
 4787                                                WHEN INFO NOT IN ('', '.')
 4788                                                THEN INFO
 4789                                                ELSE ''
 4790                                            END,
 4791                                            CASE
 4792                                                WHEN table_variants.INFO NOT IN ('','.')
 4793                                                THEN ';'
 4794                                                ELSE ''
 4795                                            END,
 4796                                            (
 4797                                            SELECT 
 4798                                                concat(
 4799                                                    {",".join(sql_query_update_concat_fields)}
 4800                                                )
 4801                                            FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) as table_parquet
 4802                                                    WHERE concat('chr', CAST(table_parquet.\"CONTIG\" AS STRING)) = table_variants.\"#CHROM\"
 4803                                                    AND table_parquet.\"START\" = table_variants.\"POS\"
 4804                                                    AND table_parquet.\"ALT\" = table_variants.\"ALT\"
 4805                                                    AND table_parquet.\"REF\" = table_variants.\"REF\"
 4806                                            )
 4807                                        )
 4808                            ;
 4809                        """
 4810
 4811                    # Update
 4812                    self.conn.execute(sql_query_update)
 4813
 4814                ### Annotate with VCF INFO field ###
 4815
 4816                # Init result VCF file
 4817                output_results_vcf = os.path.join(output_results, "howard.vcf.gz")
 4818
 4819                # If VCF exists
 4820                if os.path.exists(output_results_vcf):
 4821
 4822                    # Log
 4823                    log.debug("Exomiser result VCF update variants")
 4824
 4825                    # Find Exomiser INFO field annotation in header
 4826                    with gzip.open(output_results_vcf, "rt") as f:
 4827                        header_list = self.read_vcf_header(f)
 4828                    exomiser_vcf_header = vcf.Reader(
 4829                        io.StringIO("\n".join(header_list))
 4830                    )
 4831
 4832                    # Add annotation INFO field to header
 4833                    vcf_reader.infos["Exomiser"] = exomiser_vcf_header.infos["Exomiser"]
 4834
 4835                    # Update variants with VCF
 4836                    self.update_from_vcf(output_results_vcf)
 4837
 4838        return True
 4839
 4840    def annotation_snpeff(self, threads: int = None) -> None:
 4841        """
 4842        This function annotate with snpEff
 4843
 4844        :param threads: The number of threads to use
 4845        :return: the value of the variable "return_value".
 4846        """
 4847
 4848        # DEBUG
 4849        log.debug("Start annotation with snpeff databases")
 4850
 4851        # Threads
 4852        if not threads:
 4853            threads = self.get_threads()
 4854        log.debug("Threads: " + str(threads))
 4855
 4856        # DEBUG
 4857        delete_tmp = True
 4858        if self.get_config().get("verbosity", "warning") in ["debug"]:
 4859            delete_tmp = False
 4860            log.debug("Delete tmp files/folders: " + str(delete_tmp))
 4861
 4862        # Config
 4863        config = self.get_config()
 4864        log.debug("Config: " + str(config))
 4865
 4866        # Config - Folders - Databases
 4867        databases_folders = (
 4868            config.get("folders", {}).get("databases", {}).get("snpeff", ["."])
 4869        )
 4870        log.debug("Databases annotations: " + str(databases_folders))
 4871
 4872        # # Config - Java
 4873        # java_bin = get_bin(
 4874        #     tool="java",
 4875        #     bin="java",
 4876        #     bin_type="bin",
 4877        #     config=config,
 4878        #     default_folder="/usr/bin",
 4879        # )
 4880        # if not (os.path.exists(java_bin) or (java_bin and which(java_bin))):
 4881        #     log.error(f"Annotation failed: no java bin '{java_bin}'")
 4882        #     raise ValueError(f"Annotation failed: no java bin '{java_bin}'")
 4883
 4884        # # Config - snpEff bin
 4885        # snpeff_jar = get_bin(
 4886        #     tool="snpeff",
 4887        #     bin="snpEff.jar",
 4888        #     bin_type="jar",
 4889        #     config=config,
 4890        #     default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
 4891        # )
 4892        # if not (os.path.exists(snpeff_jar) or (snpeff_jar and which(snpeff_jar))):
 4893        #     log.error(f"Annotation failed: no snpEff jar '{snpeff_jar}'")
 4894        #     raise ValueError(f"Annotation failed: no snpEff jar '{snpeff_jar}'")
 4895
 4896        # Config - snpEff bin command
 4897        snpeff_bin_command = get_bin_command(
 4898            bin="snpEff.jar",
 4899            tool="snpeff",
 4900            bin_type="jar",
 4901            config=config,
 4902            default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
 4903        )
 4904        if not snpeff_bin_command:
 4905            msg_err = f"Annotation failed: no snpeff bin '{snpeff_bin_command}'"
 4906            log.error(msg_err)
 4907            raise ValueError(msg_err)
 4908
 4909        # Config - snpEff databases
 4910        snpeff_databases = (
 4911            config.get("folders", {})
 4912            .get("databases", {})
 4913            .get("snpeff", DEFAULT_SNPEFF_FOLDER)
 4914        )
 4915        snpeff_databases = full_path(snpeff_databases)
 4916        if snpeff_databases is not None and snpeff_databases != "":
 4917            log.debug(f"Create snpEff databases folder")
 4918            if not os.path.exists(snpeff_databases):
 4919                os.makedirs(snpeff_databases)
 4920
 4921        # Param
 4922        param = self.get_param()
 4923        log.debug("Param: " + str(param))
 4924
 4925        # Param
 4926        options = param.get("annotation", {}).get("snpeff", {}).get("options", None)
 4927        log.debug("Options: " + str(options))
 4928
 4929        # Param - Assembly
 4930        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
 4931
 4932        # Param - Options
 4933        snpeff_options = (
 4934            param.get("annotation", {}).get("snpeff", {}).get("options", "")
 4935        )
 4936        snpeff_stats = param.get("annotation", {}).get("snpeff", {}).get("stats", None)
 4937        snpeff_csvstats = (
 4938            param.get("annotation", {}).get("snpeff", {}).get("csvStats", None)
 4939        )
 4940        if snpeff_stats:
 4941            snpeff_stats = snpeff_stats.replace("OUTPUT", self.get_output())
 4942            snpeff_stats = full_path(snpeff_stats)
 4943            snpeff_options += f" -stats {snpeff_stats}"
 4944        if snpeff_csvstats:
 4945            snpeff_csvstats = snpeff_csvstats.replace("OUTPUT", self.get_output())
 4946            snpeff_csvstats = full_path(snpeff_csvstats)
 4947            snpeff_options += f" -csvStats {snpeff_csvstats}"
 4948
 4949        # Data
 4950        table_variants = self.get_table_variants()
 4951
 4952        # Check if not empty
 4953        log.debug("Check if not empty")
 4954        sql_query_chromosomes = (
 4955            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 4956        )
 4957        # if not self.conn.execute(f"{sql_query_chromosomes}").df()["count"][0]:
 4958        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
 4959            log.info(f"VCF empty")
 4960            return
 4961
 4962        # Export in VCF
 4963        log.debug("Create initial file to annotate")
 4964        tmp_vcf = NamedTemporaryFile(
 4965            prefix=self.get_prefix(),
 4966            dir=self.get_tmp_dir(),
 4967            suffix=".vcf.gz",
 4968            delete=True,
 4969        )
 4970        tmp_vcf_name = tmp_vcf.name
 4971
 4972        # VCF header
 4973        vcf_reader = self.get_header()
 4974        log.debug("Initial header: " + str(vcf_reader.infos))
 4975
 4976        # Existing annotations
 4977        for vcf_annotation in self.get_header().infos:
 4978
 4979            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 4980            log.debug(
 4981                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 4982            )
 4983
 4984        # Memory limit
 4985        # if config.get("memory", None):
 4986        #     memory_limit = config.get("memory", "8G")
 4987        # else:
 4988        #     memory_limit = "8G"
 4989        memory_limit = self.get_memory("8G")
 4990        log.debug(f"memory_limit: {memory_limit}")
 4991
 4992        # snpEff java options
 4993        snpeff_java_options = (
 4994            f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} "
 4995        )
 4996        log.debug(f"Exomiser java options: {snpeff_java_options}")
 4997
 4998        force_update_annotation = True
 4999
 5000        if "ANN" not in self.get_header().infos or force_update_annotation:
 5001
 5002            # Check snpEff database
 5003            log.debug(f"Check snpEff databases {[assembly]}")
 5004            databases_download_snpeff(
 5005                folder=snpeff_databases, assemblies=[assembly], config=config
 5006            )
 5007
 5008            # Export VCF file
 5009            self.export_variant_vcf(
 5010                vcf_file=tmp_vcf_name,
 5011                remove_info=True,
 5012                add_samples=False,
 5013                index=True,
 5014            )
 5015
 5016            # Tmp file
 5017            err_files = []
 5018            tmp_annotate_vcf = NamedTemporaryFile(
 5019                prefix=self.get_prefix(),
 5020                dir=self.get_tmp_dir(),
 5021                suffix=".vcf",
 5022                delete=False,
 5023            )
 5024            tmp_annotate_vcf_name = tmp_annotate_vcf.name
 5025            tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
 5026            err_files.append(tmp_annotate_vcf_name_err)
 5027
 5028            # Command
 5029            snpeff_command = f"{snpeff_bin_command} {assembly} -dataDir {snpeff_databases} {snpeff_options} {tmp_vcf_name} 1>{tmp_annotate_vcf_name} 2>>{tmp_annotate_vcf_name_err}"
 5030            log.debug(f"Annotation - snpEff command: {snpeff_command}")
 5031            run_parallel_commands([snpeff_command], 1)
 5032
 5033            # Error messages
 5034            log.info(f"Error/Warning messages:")
 5035            error_message_command_all = []
 5036            error_message_command_warning = []
 5037            error_message_command_err = []
 5038            for err_file in err_files:
 5039                with open(err_file, "r") as f:
 5040                    for line in f:
 5041                        message = line.strip()
 5042                        error_message_command_all.append(message)
 5043                        if line.startswith("[W::"):
 5044                            error_message_command_warning.append(message)
 5045                        if line.startswith("[E::"):
 5046                            error_message_command_err.append(f"{err_file}: " + message)
 5047            # log info
 5048            for message in list(
 5049                set(error_message_command_err + error_message_command_warning)
 5050            ):
 5051                log.info(f"   {message}")
 5052            # debug info
 5053            for message in list(set(error_message_command_all)):
 5054                log.debug(f"   {message}")
 5055            # failed
 5056            if len(error_message_command_err):
 5057                log.error("Annotation failed: Error in commands")
 5058                raise ValueError("Annotation failed: Error in commands")
 5059
 5060            # Find annotation in header
 5061            with open(tmp_annotate_vcf_name, "rt") as f:
 5062                header_list = self.read_vcf_header(f)
 5063            annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))
 5064
 5065            for ann in annovar_vcf_header.infos:
 5066                if ann not in self.get_header().infos:
 5067                    vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)
 5068
 5069            # Update variants
 5070            log.info(f"Annotation - Updating...")
 5071            self.update_from_vcf(tmp_annotate_vcf_name)
 5072
 5073        else:
 5074            if "ANN" in self.get_header().infos:
 5075                log.debug(f"Existing snpEff annotations in VCF")
 5076            if force_update_annotation:
 5077                log.debug(f"Existing snpEff annotations in VCF - annotation forced")
 5078
 5079    def annotation_annovar(self, threads: int = None) -> None:
 5080        """
 5081        It takes a VCF file, annotates it with Annovar, and then updates the database with the new
 5082        annotations
 5083
 5084        :param threads: number of threads to use
 5085        :return: the value of the variable "return_value".
 5086        """
 5087
 5088        # DEBUG
 5089        log.debug("Start annotation with Annovar databases")
 5090
 5091        # Threads
 5092        if not threads:
 5093            threads = self.get_threads()
 5094        log.debug("Threads: " + str(threads))
 5095
 5096        # Tmp en Err files
 5097        tmp_files = []
 5098        err_files = []
 5099
 5100        # DEBUG
 5101        delete_tmp = True
 5102        if self.get_config().get("verbosity", "warning") in ["debug"]:
 5103            delete_tmp = False
 5104            log.debug("Delete tmp files/folders: " + str(delete_tmp))
 5105
 5106        # Config
 5107        config = self.get_config()
 5108        log.debug("Config: " + str(config))
 5109
 5110        # Config - Folders - Databases
 5111        databases_folders = (
 5112            config.get("folders", {}).get("databases", {}).get("annovar", ["."])
 5113        )
 5114        log.debug("Databases annotations: " + str(databases_folders))
 5115
 5116        # Config - annovar bin command
 5117        annovar_bin_command = get_bin_command(
 5118            bin="table_annovar.pl",
 5119            tool="annovar",
 5120            bin_type="perl",
 5121            config=config,
 5122            default_folder=f"{DEFAULT_TOOLS_FOLDER}/annovar",
 5123        )
 5124        if not annovar_bin_command:
 5125            msg_err = f"Annotation failed: no annovar bin '{annovar_bin_command}'"
 5126            log.error(msg_err)
 5127            raise ValueError(msg_err)
 5128
 5129        # Config - BCFTools bin command
 5130        bcftools_bin_command = get_bin_command(
 5131            bin="bcftools",
 5132            tool="bcftools",
 5133            bin_type="bin",
 5134            config=config,
 5135            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
 5136        )
 5137        if not bcftools_bin_command:
 5138            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
 5139            log.error(msg_err)
 5140            raise ValueError(msg_err)
 5141
 5142        # Config - annovar databases
 5143        annovar_databases = (
 5144            config.get("folders", {})
 5145            .get("databases", {})
 5146            .get("annovar", DEFAULT_ANNOVAR_FOLDER)
 5147        )
 5148        annovar_databases = full_path(annovar_databases)
 5149        if annovar_databases != "" and not os.path.exists(annovar_databases):
 5150            os.makedirs(annovar_databases)
 5151
 5152        # Param
 5153        param = self.get_param()
 5154        log.debug("Param: " + str(param))
 5155
 5156        # Param - options
 5157        options = param.get("annotation", {}).get("annovar", {}).get("options", {})
 5158        log.debug("Options: " + str(options))
 5159
 5160        # Param - annotations
 5161        annotations = (
 5162            param.get("annotation", {}).get("annovar", {}).get("annotations", {})
 5163        )
 5164        log.debug("Annotations: " + str(annotations))
 5165
 5166        # Param - Assembly
 5167        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
 5168
 5169        # Annovar database assembly
 5170        annovar_databases_assembly = f"{annovar_databases}/{assembly}"
 5171        if annovar_databases_assembly != "" and not os.path.exists(
 5172            annovar_databases_assembly
 5173        ):
 5174            os.makedirs(annovar_databases_assembly)
 5175
 5176        # Data
 5177        table_variants = self.get_table_variants()
 5178
 5179        # Check if not empty
 5180        log.debug("Check if not empty")
 5181        sql_query_chromosomes = (
 5182            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 5183        )
 5184        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
 5185        if not sql_query_chromosomes_df["count"][0]:
 5186            log.info(f"VCF empty")
 5187            return
 5188
 5189        # VCF header
 5190        vcf_reader = self.get_header()
 5191        log.debug("Initial header: " + str(vcf_reader.infos))
 5192
 5193        # Existing annotations
 5194        for vcf_annotation in self.get_header().infos:
 5195
 5196            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 5197            log.debug(
 5198                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 5199            )
 5200
 5201        force_update_annotation = True
 5202
 5203        if annotations:
 5204
 5205            commands = []
 5206            tmp_annotates_vcf_name_list = []
 5207
 5208            # Export in VCF
 5209            log.debug("Create initial file to annotate")
 5210            tmp_vcf = NamedTemporaryFile(
 5211                prefix=self.get_prefix(),
 5212                dir=self.get_tmp_dir(),
 5213                suffix=".vcf.gz",
 5214                delete=False,
 5215            )
 5216            tmp_vcf_name = tmp_vcf.name
 5217            tmp_files.append(tmp_vcf_name)
 5218            tmp_files.append(tmp_vcf_name + ".tbi")
 5219
 5220            # Export VCF file
 5221            self.export_variant_vcf(
 5222                vcf_file=tmp_vcf_name,
 5223                remove_info=".",
 5224                add_samples=False,
 5225                index=True,
 5226            )
 5227
 5228            # Create file for field rename
 5229            log.debug("Create file for field rename")
 5230            tmp_rename = NamedTemporaryFile(
 5231                prefix=self.get_prefix(),
 5232                dir=self.get_tmp_dir(),
 5233                suffix=".rename",
 5234                delete=False,
 5235            )
 5236            tmp_rename_name = tmp_rename.name
 5237            tmp_files.append(tmp_rename_name)
 5238
 5239            # Check Annovar database
 5240            log.debug(
 5241                f"Check Annovar databases {[assembly]}: {list(annotations.keys())}"
 5242            )
 5243            databases_download_annovar(
 5244                folder=annovar_databases,
 5245                files=list(annotations.keys()),
 5246                assemblies=[assembly],
 5247            )
 5248
 5249            for annotation in annotations:
 5250                annotation_fields = annotations[annotation]
 5251
 5252                if not annotation_fields:
 5253                    annotation_fields = {"INFO": None}
 5254
 5255                log.info(f"Annotations Annovar - database '{annotation}'")
 5256                log.debug(f"Annotation '{annotation}' - fields: {annotation_fields}")
 5257
 5258                # Tmp file for annovar
 5259                err_files = []
 5260                tmp_annotate_vcf_directory = TemporaryDirectory(
 5261                    prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".annovar"
 5262                )
 5263                tmp_annotate_vcf_prefix = tmp_annotate_vcf_directory.name + "/annovar"
 5264                tmp_annotate_vcf_name_annovar = (
 5265                    tmp_annotate_vcf_prefix + "." + assembly + "_multianno.vcf"
 5266                )
 5267                tmp_annotate_vcf_name_err = tmp_annotate_vcf_directory.name + "/.err"
 5268                err_files.append(tmp_annotate_vcf_name_err)
 5269                tmp_files.append(tmp_annotate_vcf_name_err)
 5270
 5271                # Tmp file final vcf annotated by annovar
 5272                tmp_annotate_vcf = NamedTemporaryFile(
 5273                    prefix=self.get_prefix(),
 5274                    dir=self.get_tmp_dir(),
 5275                    suffix=".vcf.gz",
 5276                    delete=False,
 5277                )
 5278                tmp_annotate_vcf_name = tmp_annotate_vcf.name
 5279                tmp_annotates_vcf_name_list.append(tmp_annotate_vcf_name)
 5280                tmp_files.append(tmp_annotate_vcf_name)
 5281                tmp_files.append(tmp_annotate_vcf_name + ".tbi")
 5282
 5283                # Number of fields
 5284                annotation_list = []
 5285                annotation_renamed_list = []
 5286
 5287                for annotation_field in annotation_fields:
 5288
 5289                    # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
 5290                    annotation_fields_new_name = annotation_fields.get(
 5291                        annotation_field, annotation_field
 5292                    )
 5293                    if not annotation_fields_new_name:
 5294                        annotation_fields_new_name = annotation_field
 5295
 5296                    if (
 5297                        force_update_annotation
 5298                        or annotation_fields_new_name not in self.get_header().infos
 5299                    ):
 5300                        annotation_list.append(annotation_field)
 5301                        annotation_renamed_list.append(annotation_fields_new_name)
 5302                    else:  # annotation_fields_new_name in self.get_header().infos and not force_update_annotation:
 5303                        log.warning(
 5304                            f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
 5305                        )
 5306
 5307                    # Add rename info
 5308                    run_parallel_commands(
 5309                        [
 5310                            f"echo 'INFO/{annotation_field} {annotation_fields_new_name}' >> {tmp_rename_name}"
 5311                        ],
 5312                        1,
 5313                    )
 5314
 5315                # log.debug("fields_to_removed: " + str(fields_to_removed))
 5316                log.debug("annotation_list: " + str(annotation_list))
 5317
 5318                # protocol
 5319                protocol = annotation
 5320
 5321                # argument
 5322                argument = ""
 5323
 5324                # operation
 5325                operation = "f"
 5326                if annotation in ["refGene", "refGeneWithVer"] or annotation.startswith(
 5327                    "ensGene"
 5328                ):
 5329                    operation = "g"
 5330                    if options.get("genebase", None):
 5331                        argument = f"""'{options.get("genebase","")}'"""
 5332                elif annotation in ["cytoBand"]:
 5333                    operation = "r"
 5334
 5335                # argument option
 5336                argument_option = ""
 5337                if argument != "":
 5338                    argument_option = " --argument " + argument
 5339
 5340                # command options
 5341                command_options = f""" --nastring . --vcfinput --polish --dot2underline --thread {threads} """  # --intronhgvs 10
 5342                for option in options:
 5343                    if option not in ["genebase"]:
 5344                        command_options += f""" --{option}={options[option]}"""
 5345
 5346                # Command
 5347
 5348                # Command - Annovar
 5349                command_annovar = f"""{annovar_bin_command} {tmp_vcf_name} {annovar_databases_assembly} --buildver {assembly} --outfile {tmp_annotate_vcf_prefix} --remove --protocol {protocol} --operation {operation} {argument_option} {command_options} 2>>{tmp_annotate_vcf_name_err} && mv {tmp_annotate_vcf_name_annovar} {tmp_annotate_vcf_name}.tmp.vcf """
 5350                tmp_files.append(f"{tmp_annotate_vcf_name}.tmp.vcf")
 5351
 5352                # Command - start pipe
 5353                command_annovar += f""" && {bcftools_bin_command} view --threads={threads} {tmp_annotate_vcf_name}.tmp.vcf 2>>{tmp_annotate_vcf_name_err} """
 5354
 5355                # Command - Clean INFO/ANNOVAR_DATE (due to Annovar issue with multiple TAGS!)
 5356                command_annovar += """ | sed "s/ANNOVAR_DATE=[^;\t]*;//gi" """
 5357
 5358                # Command - Special characters (refGene annotation)
 5359                command_annovar += """ | sed "s/\\\\\\x3b/,/gi" """
 5360
 5361                # Command - Clean empty fields (with value ".")
 5362                command_annovar += """ | awk -F'\\t' -v OFS='\\t' '{if ($0 ~ /^#/) print; else {split($8,a,";");for(i=1;i<=length(a);i++) {split(a[i],b,"=");if(b[2]!=".") {c[b[1]]=b[2]}}; split($8,d,";");for(i=1;i<=length(d);i++) {split(d[i],e,"=");if(c[e[1]]!="") {if(info!="") {info=info";"}; info=info""e[1]"="c[e[1]]}}; if(info!="") {$8=info} else {$8=""}; delete c; info=""; print}}' """
 5363
 5364                # Command - Extract only needed fields, and remove ANNOVAR fields, and compress and index final file
 5365                annovar_fields_to_keep = ["INFO/ANNOVAR_DATE", "INFO/ALLELE_END"]
 5366                if "ALL" not in annotation_list and "INFO" not in annotation_list:
 5367                    # for ann in annotation_renamed_list:
 5368                    for ann in annotation_list:
 5369                        annovar_fields_to_keep.append(f"^INFO/{ann}")
 5370
 5371                command_annovar += f""" | {bcftools_bin_command} annotate --pair-logic exact --threads={threads} -x {",".join(annovar_fields_to_keep)} --rename-annots={tmp_rename_name} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} """
 5372
 5373                # Command - indexing
 5374                command_annovar += f"""  && tabix {tmp_annotate_vcf_name} """
 5375
 5376                log.debug(f"Annotation - Annovar command: {command_annovar}")
 5377                run_parallel_commands([command_annovar], 1)
 5378
 5379                # Error messages
 5380                log.info(f"Error/Warning messages:")
 5381                error_message_command_all = []
 5382                error_message_command_warning = []
 5383                error_message_command_err = []
 5384                for err_file in err_files:
 5385                    with open(err_file, "r") as f:
 5386                        for line in f:
 5387                            message = line.strip()
 5388                            error_message_command_all.append(message)
 5389                            if line.startswith("[W::") or line.startswith("WARNING"):
 5390                                error_message_command_warning.append(message)
 5391                            if line.startswith("[E::") or line.startswith("ERROR"):
 5392                                error_message_command_err.append(
 5393                                    f"{err_file}: " + message
 5394                                )
 5395                # log info
 5396                for message in list(
 5397                    set(error_message_command_err + error_message_command_warning)
 5398                ):
 5399                    log.info(f"   {message}")
 5400                # debug info
 5401                for message in list(set(error_message_command_all)):
 5402                    log.debug(f"   {message}")
 5403                # failed
 5404                if len(error_message_command_err):
 5405                    log.error("Annotation failed: Error in commands")
 5406                    raise ValueError("Annotation failed: Error in commands")
 5407
 5408            if tmp_annotates_vcf_name_list:
 5409
 5410                # List of annotated files
 5411                tmp_annotates_vcf_name_to_merge = " ".join(tmp_annotates_vcf_name_list)
 5412
 5413                # Tmp file
 5414                tmp_annotate_vcf = NamedTemporaryFile(
 5415                    prefix=self.get_prefix(),
 5416                    dir=self.get_tmp_dir(),
 5417                    suffix=".vcf.gz",
 5418                    delete=False,
 5419                )
 5420                tmp_annotate_vcf_name = tmp_annotate_vcf.name
 5421                tmp_files.append(tmp_annotate_vcf_name)
 5422                tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
 5423                err_files.append(tmp_annotate_vcf_name_err)
 5424                tmp_files.append(tmp_annotate_vcf_name_err)
 5425
 5426                # Command merge
 5427                merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_annotates_vcf_name_to_merge} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} "
 5428                log.info(
 5429                    f"Annotation Annovar - Annotation merging "
 5430                    + str(len(tmp_annotates_vcf_name_list))
 5431                    + " annotated files"
 5432                )
 5433                log.debug(f"Annotation - merge command: {merge_command}")
 5434                run_parallel_commands([merge_command], 1)
 5435
 5436                # Find annotation in header
 5437                with bgzf.open(tmp_annotate_vcf_name, "rt") as f:
 5438                    header_list = self.read_vcf_header(f)
 5439                annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))
 5440
 5441                for ann in annovar_vcf_header.infos:
 5442                    if ann not in self.get_header().infos:
 5443                        vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)
 5444
 5445                # Update variants
 5446                log.info(f"Annotation Annovar - Updating...")
 5447                self.update_from_vcf(tmp_annotate_vcf_name)
 5448
 5449            # Clean files
 5450            # Tmp file remove command
 5451            if True:
 5452                tmp_files_remove_command = ""
 5453                if tmp_files:
 5454                    tmp_files_remove_command = " ".join(tmp_files)
 5455                clean_command = f" rm -f {tmp_files_remove_command} "
 5456                log.debug(f"Annotation Annovar - Annotation cleaning ")
 5457                log.debug(f"Annotation - cleaning command: {clean_command}")
 5458                run_parallel_commands([clean_command], 1)
 5459
 5460    # Parquet
 5461    def annotation_parquet(self, threads: int = None) -> None:
 5462        """
 5463        It takes a VCF file, and annotates it with a parquet file
 5464
 5465        :param threads: number of threads to use for the annotation
 5466        :return: the value of the variable "result".
 5467        """
 5468
 5469        # DEBUG
 5470        log.debug("Start annotation with parquet databases")
 5471
 5472        # Threads
 5473        if not threads:
 5474            threads = self.get_threads()
 5475        log.debug("Threads: " + str(threads))
 5476
 5477        # DEBUG
 5478        delete_tmp = True
 5479        if self.get_config().get("verbosity", "warning") in ["debug"]:
 5480            delete_tmp = False
 5481            log.debug("Delete tmp files/folders: " + str(delete_tmp))
 5482
 5483        # Config
 5484        databases_folders = set(
 5485            self.get_config()
 5486            .get("folders", {})
 5487            .get("databases", {})
 5488            .get("annotations", ["."])
 5489            + self.get_config()
 5490            .get("folders", {})
 5491            .get("databases", {})
 5492            .get("parquet", ["."])
 5493        )
 5494        log.debug("Databases annotations: " + str(databases_folders))
 5495
 5496        # Param
 5497        annotations = (
 5498            self.get_param()
 5499            .get("annotation", {})
 5500            .get("parquet", {})
 5501            .get("annotations", None)
 5502        )
 5503        log.debug("Annotations: " + str(annotations))
 5504
 5505        # Assembly
 5506        assembly = self.get_param().get(
 5507            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
 5508        )
 5509
 5510        # Force Update Annotation
 5511        force_update_annotation = (
 5512            self.get_param()
 5513            .get("annotation", {})
 5514            .get("options", {})
 5515            .get("annotations_update", False)
 5516        )
 5517        log.debug(f"force_update_annotation={force_update_annotation}")
 5518        force_append_annotation = (
 5519            self.get_param()
 5520            .get("annotation", {})
 5521            .get("options", {})
 5522            .get("annotations_append", False)
 5523        )
 5524        log.debug(f"force_append_annotation={force_append_annotation}")
 5525
 5526        # Data
 5527        table_variants = self.get_table_variants()
 5528
 5529        # Check if not empty
 5530        log.debug("Check if not empty")
 5531        sql_query_chromosomes_df = self.get_query_to_df(
 5532            f"""SELECT count(*) as count FROM {table_variants} as table_variants LIMIT 1"""
 5533        )
 5534        if not sql_query_chromosomes_df["count"][0]:
 5535            log.info(f"VCF empty")
 5536            return
 5537
 5538        # VCF header
 5539        vcf_reader = self.get_header()
 5540        log.debug("Initial header: " + str(vcf_reader.infos))
 5541
 5542        # Nb Variants POS
 5543        log.debug("NB Variants Start")
 5544        nb_variants = self.conn.execute(
 5545            f"SELECT count(*) AS count FROM variants"
 5546        ).fetchdf()["count"][0]
 5547        log.debug("NB Variants Stop")
 5548
 5549        # Existing annotations
 5550        for vcf_annotation in self.get_header().infos:
 5551
 5552            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 5553            log.debug(
 5554                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 5555            )
 5556
 5557        # Added columns
 5558        added_columns = []
 5559
 5560        # drop indexes
 5561        log.debug(f"Drop indexes...")
 5562        self.drop_indexes()
 5563
 5564        if annotations:
 5565
 5566            if "ALL" in annotations:
 5567
 5568                all_param = annotations.get("ALL", {})
 5569                all_param_formats = all_param.get("formats", None)
 5570                all_param_releases = all_param.get("releases", None)
 5571
 5572                databases_infos_dict = self.scan_databases(
 5573                    database_formats=all_param_formats,
 5574                    database_releases=all_param_releases,
 5575                )
 5576                for database_infos in databases_infos_dict.keys():
 5577                    if database_infos not in annotations:
 5578                        annotations[database_infos] = {"INFO": None}
 5579
 5580            for annotation in annotations:
 5581
 5582                if annotation in ["ALL"]:
 5583                    continue
 5584
 5585                # Annotation Name
 5586                annotation_name = os.path.basename(annotation)
 5587
 5588                # Annotation fields
 5589                annotation_fields = annotations[annotation]
 5590                if not annotation_fields:
 5591                    annotation_fields = {"INFO": None}
 5592
 5593                log.debug(f"Annotation '{annotation_name}'")
 5594                log.debug(
 5595                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
 5596                )
 5597
 5598                # Create Database
 5599                database = Database(
 5600                    database=annotation,
 5601                    databases_folders=databases_folders,
 5602                    assembly=assembly,
 5603                )
 5604
 5605                # Find files
 5606                parquet_file = database.get_database()
 5607                parquet_hdr_file = database.get_header_file()
 5608                parquet_type = database.get_type()
 5609
 5610                # Check if files exists
 5611                if not parquet_file or not parquet_hdr_file:
 5612                    log.error("Annotation failed: file not found")
 5613                    raise ValueError("Annotation failed: file not found")
 5614                else:
 5615                    # Get parquet connexion
 5616                    parquet_sql_attach = database.get_sql_database_attach(
 5617                        output="query"
 5618                    )
 5619                    if parquet_sql_attach:
 5620                        self.conn.execute(parquet_sql_attach)
 5621                    parquet_file_link = database.get_sql_database_link()
 5622                    # Log
 5623                    log.debug(
 5624                        f"Annotation '{annotation_name}' - file: "
 5625                        + str(parquet_file)
 5626                        + " and "
 5627                        + str(parquet_hdr_file)
 5628                    )
 5629
 5630                    # Database full header columns
 5631                    parquet_hdr_vcf_header_columns = database.get_header_file_columns(
 5632                        parquet_hdr_file
 5633                    )
 5634                    # Log
 5635                    log.debug(
 5636                        "Annotation database header columns : "
 5637                        + str(parquet_hdr_vcf_header_columns)
 5638                    )
 5639
 5640                    # Load header as VCF object
 5641                    parquet_hdr_vcf_header_infos = database.get_header().infos
 5642                    # Log
 5643                    log.debug(
 5644                        "Annotation database header: "
 5645                        + str(parquet_hdr_vcf_header_infos)
 5646                    )
 5647
 5648                    # Get extra infos
 5649                    parquet_columns = database.get_extra_columns()
 5650                    # Log
 5651                    log.debug("Annotation database Columns: " + str(parquet_columns))
 5652
 5653                    # Add extra columns if "ALL" in annotation_fields
 5654                    # if "ALL" in annotation_fields:
 5655                    #     allow_add_extra_column = True
 5656                    if "ALL" in annotation_fields and database.get_extra_columns():
 5657                        for extra_column in database.get_extra_columns():
 5658                            if (
 5659                                extra_column not in annotation_fields
 5660                                and extra_column.replace("INFO/", "")
 5661                                not in parquet_hdr_vcf_header_infos
 5662                            ):
 5663                                parquet_hdr_vcf_header_infos[extra_column] = (
 5664                                    vcf.parser._Info(
 5665                                        extra_column,
 5666                                        ".",
 5667                                        "String",
 5668                                        f"{extra_column} description",
 5669                                        "unknown",
 5670                                        "unknown",
 5671                                        self.code_type_map["String"],
 5672                                    )
 5673                                )
 5674
 5675                    # For all fields in database
 5676                    annotation_fields_all = False
 5677                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
 5678                        annotation_fields_all = True
 5679                        annotation_fields = {
 5680                            key: key for key in parquet_hdr_vcf_header_infos
 5681                        }
 5682
 5683                        log.debug(
 5684                            "Annotation database header - All annotations added: "
 5685                            + str(annotation_fields)
 5686                        )
 5687
 5688                    # Init
 5689
 5690                    # List of annotation fields to use
 5691                    sql_query_annotation_update_info_sets = []
 5692
 5693                    # List of annotation to agregate
 5694                    sql_query_annotation_to_agregate = []
 5695
 5696                    # Number of fields
 5697                    nb_annotation_field = 0
 5698
 5699                    # Annotation fields processed
 5700                    annotation_fields_processed = []
 5701
 5702                    # Columns mapping
 5703                    map_columns = database.map_columns(
 5704                        columns=annotation_fields, prefixes=["INFO/"]
 5705                    )
 5706
 5707                    # Query dict for fields to remove (update option)
 5708                    query_dict_remove = {}
 5709
 5710                    # Fetch Anotation fields
 5711                    for annotation_field in annotation_fields:
 5712
 5713                        # annotation_field_column
 5714                        annotation_field_column = map_columns.get(
 5715                            annotation_field, "INFO"
 5716                        )
 5717
 5718                        # field new name, if parametered
 5719                        annotation_fields_new_name = annotation_fields.get(
 5720                            annotation_field, annotation_field
 5721                        )
 5722                        if not annotation_fields_new_name:
 5723                            annotation_fields_new_name = annotation_field
 5724
 5725                        # To annotate
 5726                        # force_update_annotation = True
 5727                        # force_append_annotation = True
 5728                        # if annotation_field in parquet_hdr_vcf_header_infos and (force_update_annotation or (annotation_fields_new_name not in self.get_header().infos)):
 5729                        if annotation_field in parquet_hdr_vcf_header_infos and (
 5730                            force_update_annotation
 5731                            or force_append_annotation
 5732                            or (
 5733                                annotation_fields_new_name
 5734                                not in self.get_header().infos
 5735                            )
 5736                        ):
 5737
 5738                            # Add field to annotation to process list
 5739                            annotation_fields_processed.append(
 5740                                annotation_fields_new_name
 5741                            )
 5742
 5743                            # explode infos for the field
 5744                            annotation_fields_new_name_info_msg = ""
 5745                            if (
 5746                                force_update_annotation
 5747                                and annotation_fields_new_name
 5748                                in self.get_header().infos
 5749                            ):
 5750                                # Remove field from INFO
 5751                                query = f"""
 5752                                    UPDATE {table_variants} as table_variants
 5753                                    SET INFO = REGEXP_REPLACE(
 5754                                                concat(table_variants.INFO,''),
 5755                                                ';*{annotation_fields_new_name}=[^;]*',
 5756                                                ''
 5757                                                )
 5758                                    WHERE concat(';',table_variants.INFO) LIKE '%;{annotation_fields_new_name}=%'
 5759                                """
 5760                                annotation_fields_new_name_info_msg = " [update]"
 5761                                query_dict_remove[
 5762                                    f"remove 'INFO/{annotation_fields_new_name}'"
 5763                                ] = query
 5764
 5765                            # Sep between fields in INFO
 5766                            nb_annotation_field += 1
 5767                            if nb_annotation_field > 1:
 5768                                annotation_field_sep = ";"
 5769                            else:
 5770                                annotation_field_sep = ""
 5771
 5772                            log.info(
 5773                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'{annotation_fields_new_name_info_msg}"
 5774                            )
 5775
 5776                            # Add INFO field to header
 5777                            parquet_hdr_vcf_header_infos_number = (
 5778                                parquet_hdr_vcf_header_infos[annotation_field].num
 5779                                or "."
 5780                            )
 5781                            parquet_hdr_vcf_header_infos_type = (
 5782                                parquet_hdr_vcf_header_infos[annotation_field].type
 5783                                or "String"
 5784                            )
 5785                            parquet_hdr_vcf_header_infos_description = (
 5786                                parquet_hdr_vcf_header_infos[annotation_field].desc
 5787                                or f"{annotation_field} description"
 5788                            )
 5789                            parquet_hdr_vcf_header_infos_source = (
 5790                                parquet_hdr_vcf_header_infos[annotation_field].source
 5791                                or "unknown"
 5792                            )
 5793                            parquet_hdr_vcf_header_infos_version = (
 5794                                parquet_hdr_vcf_header_infos[annotation_field].version
 5795                                or "unknown"
 5796                            )
 5797
 5798                            vcf_reader.infos[annotation_fields_new_name] = (
 5799                                vcf.parser._Info(
 5800                                    annotation_fields_new_name,
 5801                                    parquet_hdr_vcf_header_infos_number,
 5802                                    parquet_hdr_vcf_header_infos_type,
 5803                                    parquet_hdr_vcf_header_infos_description,
 5804                                    parquet_hdr_vcf_header_infos_source,
 5805                                    parquet_hdr_vcf_header_infos_version,
 5806                                    self.code_type_map[
 5807                                        parquet_hdr_vcf_header_infos_type
 5808                                    ],
 5809                                )
 5810                            )
 5811
 5812                            # Append
 5813                            if force_append_annotation:
 5814                                query_case_when_append = f""" AND REGEXP_EXTRACT(concat(';', table_variants.INFO), ';{annotation_fields_new_name}=([^;]*)',1) IN ('','.') """
 5815                            else:
 5816                                query_case_when_append = ""
 5817
 5818                            # Annotation/Update query fields
 5819                            # Found in INFO column
 5820                            if (
 5821                                annotation_field_column == "INFO"
 5822                                and "INFO" in parquet_hdr_vcf_header_columns
 5823                            ):
 5824                                sql_query_annotation_update_info_sets.append(
 5825                                    f"""
 5826                                CASE WHEN REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1) NOT IN ('','.') {query_case_when_append}
 5827                                        THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1))
 5828                                        ELSE ''
 5829                                    END
 5830                                """
 5831                                )
 5832                            # Found in a specific column
 5833                            else:
 5834                                # sql_query_annotation_update_info_sets.append(
 5835                                #     f"""
 5836                                # CASE WHEN table_parquet."{annotation_field_column}" NOT IN ('','.') {query_case_when_append}
 5837                                #         THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', replace(table_parquet."{annotation_field_column}", ';', ','))
 5838                                #         ELSE ''
 5839                                #     END
 5840                                # """
 5841                                # )
 5842                                sql_query_annotation_update_info_sets.append(
 5843                                    f"""
 5844                                CASE WHEN table_parquet."{annotation_field_column}" NOT IN ('','.') {query_case_when_append}
 5845                                        THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', replace(CAST(table_parquet."{annotation_field_column}" AS VARCHAR), ';', ','))
 5846                                        ELSE ''
 5847                                    END
 5848                                """
 5849                                )
 5850                                sql_query_annotation_to_agregate.append(
 5851                                    f""" string_agg(DISTINCT table_parquet_from."{annotation_field_column}", ',') AS "{annotation_field_column}" """
 5852                                )
 5853
 5854                        # Not to annotate
 5855                        else:
 5856
 5857                            if force_update_annotation:
 5858                                annotation_message = "forced"
 5859                            else:
 5860                                annotation_message = "skipped"
 5861
 5862                            if annotation_field not in parquet_hdr_vcf_header_infos:
 5863                                log.warning(
 5864                                    f"Annotation '{annotation_name}' - '{annotation_field}' [{nb_annotation_field}] - not available in parquet file"
 5865                                )
 5866                            if annotation_fields_new_name in self.get_header().infos:
 5867                                log.warning(
 5868                                    f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' [{nb_annotation_field}] - already exists in header ({annotation_message})"
 5869                                )
 5870
 5871                    # Check if ALL fields have to be annotated. Thus concat all INFO field
 5872                    # allow_annotation_full_info = True
 5873                    allow_annotation_full_info = not force_append_annotation
 5874
 5875                    if parquet_type in ["regions"]:
 5876                        allow_annotation_full_info = False
 5877
 5878                    if (
 5879                        allow_annotation_full_info
 5880                        and nb_annotation_field == len(annotation_fields)
 5881                        and annotation_fields_all
 5882                        and (
 5883                            "INFO" in parquet_hdr_vcf_header_columns
 5884                            and "INFO" in database.get_extra_columns()
 5885                        )
 5886                    ):
 5887                        log.debug("Column INFO annotation enabled")
 5888                        sql_query_annotation_update_info_sets = []
 5889                        sql_query_annotation_update_info_sets.append(
 5890                            f" table_parquet.INFO "
 5891                        )
 5892
 5893                    if sql_query_annotation_update_info_sets:
 5894
 5895                        # Annotate
 5896                        log.info(f"Annotation '{annotation_name}' - Annotation...")
 5897
 5898                        # Join query annotation update info sets for SQL
 5899                        sql_query_annotation_update_info_sets_sql = ",".join(
 5900                            sql_query_annotation_update_info_sets
 5901                        )
 5902
 5903                        # Check chromosomes list (and variants infos)
 5904                        sql_query_chromosomes = f"""
 5905                            SELECT table_variants."#CHROM" as CHROM, count(*) AS count_variants, min(POS) AS min_variants, MAX(POS) AS max_variants
 5906                            FROM {table_variants} as table_variants
 5907                            GROUP BY table_variants."#CHROM"
 5908                            ORDER BY table_variants."#CHROM"
 5909                            """
 5910                        sql_query_chromosomes_df = self.conn.execute(
 5911                            sql_query_chromosomes
 5912                        ).df()
 5913                        sql_query_chromosomes_dict = {
 5914                            entry["CHROM"]: {
 5915                                "count": entry["count_variants"],
 5916                                "min": entry["min_variants"],
 5917                                "max": entry["max_variants"],
 5918                            }
 5919                            for index, entry in sql_query_chromosomes_df.iterrows()
 5920                        }
 5921
 5922                        # Init
 5923                        nb_of_query = 0
 5924                        nb_of_variant_annotated = 0
 5925                        query_dict = query_dict_remove
 5926
 5927                        # for chrom in sql_query_chromosomes_df["CHROM"]:
 5928                        for chrom in sql_query_chromosomes_dict:
 5929
 5930                            # Number of variant by chromosome
 5931                            nb_of_variant_by_chrom = sql_query_chromosomes_dict.get(
 5932                                chrom, {}
 5933                            ).get("count", 0)
 5934
 5935                            log.debug(
 5936                                f"Annotation '{annotation_name}' - Chromosome '{chrom}' [{nb_of_variant_by_chrom} variants]..."
 5937                            )
 5938
 5939                            # Annotation with regions database
 5940                            if parquet_type in ["regions"]:
 5941                                sql_query_annotation_from_clause = f"""
 5942                                    FROM (
 5943                                        SELECT 
 5944                                            '{chrom}' AS \"#CHROM\",
 5945                                            table_variants_from.\"POS\" AS \"POS\",
 5946                                            {",".join(sql_query_annotation_to_agregate)}
 5947                                        FROM {table_variants} as table_variants_from
 5948                                        LEFT JOIN {parquet_file_link} as table_parquet_from ON (
 5949                                            table_parquet_from."#CHROM" = '{chrom}'
 5950                                            AND table_variants_from.\"POS\" <= table_parquet_from.\"END\"
 5951                                            AND (table_variants_from.\"POS\" >= (table_parquet_from.\"START\"+1)
 5952                                                OR table_variants_from.\"POS\" + (len(table_variants_from.\"REF\")-1) >= (table_parquet_from.\"START\"+1)
 5953                                                )
 5954                                        )
 5955                                        WHERE table_variants_from.\"#CHROM\" in ('{chrom}')
 5956                                        GROUP BY table_variants_from.\"POS\"
 5957                                        )
 5958                                        as table_parquet
 5959                                """
 5960
 5961                                sql_query_annotation_where_clause = """
 5962                                    table_parquet.\"#CHROM\" = table_variants.\"#CHROM\"
 5963                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
 5964                                """
 5965
 5966                            # Annotation with variants database
 5967                            else:
 5968                                sql_query_annotation_from_clause = f"""
 5969                                    FROM {parquet_file_link} as table_parquet
 5970                                """
 5971                                sql_query_annotation_where_clause = f"""
 5972                                    table_variants."#CHROM" = '{chrom}'
 5973                                    AND table_parquet.\"#CHROM\" = table_variants.\"#CHROM\" 
 5974                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
 5975                                    AND table_parquet.\"ALT\" = table_variants.\"ALT\"
 5976                                    AND table_parquet.\"REF\" = table_variants.\"REF\"
 5977                                """
 5978
 5979                            # Create update query
 5980                            sql_query_annotation_chrom_interval_pos = f"""
 5981                                UPDATE {table_variants} as table_variants
 5982                                    SET INFO = 
 5983                                        concat(
 5984                                            CASE WHEN table_variants.INFO NOT IN ('','.')
 5985                                                THEN table_variants.INFO
 5986                                                ELSE ''
 5987                                            END
 5988                                            ,
 5989                                            CASE WHEN table_variants.INFO NOT IN ('','.')
 5990                                                        AND (
 5991                                                        concat({sql_query_annotation_update_info_sets_sql})
 5992                                                        )
 5993                                                        NOT IN ('','.') 
 5994                                                    THEN ';'
 5995                                                    ELSE ''
 5996                                            END
 5997                                            ,
 5998                                            {sql_query_annotation_update_info_sets_sql}
 5999                                            )
 6000                                    {sql_query_annotation_from_clause}
 6001                                    WHERE {sql_query_annotation_where_clause}
 6002                                    ;
 6003                                """
 6004
 6005                            # Add update query to dict
 6006                            query_dict[
 6007                                f"{chrom} [{nb_of_variant_by_chrom} variants]"
 6008                            ] = sql_query_annotation_chrom_interval_pos
 6009
 6010                        nb_of_query = len(query_dict)
 6011                        num_query = 0
 6012
 6013                        # SET max_expression_depth TO x
 6014                        self.conn.execute("SET max_expression_depth TO 10000")
 6015
 6016                        for query_name in query_dict:
 6017                            query = query_dict[query_name]
 6018                            num_query += 1
 6019                            log.info(
 6020                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name}..."
 6021                            )
 6022                            result = self.conn.execute(query)
 6023                            nb_of_variant_annotated_by_query = result.df()["Count"][0]
 6024                            nb_of_variant_annotated += nb_of_variant_annotated_by_query
 6025                            log.info(
 6026                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name} - {nb_of_variant_annotated_by_query} variants annotated"
 6027                            )
 6028
 6029                        log.info(
 6030                            f"Annotation '{annotation_name}' - Annotation of {nb_of_variant_annotated} variants out of {nb_variants} (with {nb_of_query} queries)"
 6031                        )
 6032
 6033                    else:
 6034
 6035                        log.info(
 6036                            f"Annotation '{annotation_name}' - No Annotations available"
 6037                        )
 6038
 6039                    log.debug("Final header: " + str(vcf_reader.infos))
 6040
 6041        # Remove added columns
 6042        for added_column in added_columns:
 6043            self.drop_column(column=added_column)
 6044
 6045    def annotation_splice(self, threads: int = None) -> None:
 6046        """
 6047        This function annotate with snpEff
 6048
 6049        :param threads: The number of threads to use
 6050        :return: the value of the variable "return_value".
 6051        """
 6052
 6053        # DEBUG
 6054        log.debug("Start annotation with splice tools")
 6055
 6056        # Threads
 6057        if not threads:
 6058            threads = self.get_threads()
 6059        log.debug("Threads: " + str(threads))
 6060
 6061        # DEBUG
 6062        delete_tmp = True
 6063        if self.get_config().get("verbosity", "warning") in ["debug"]:
 6064            delete_tmp = False
 6065            log.debug("Delete tmp files/folders: " + str(delete_tmp))
 6066
 6067        # Config
 6068        config = self.get_config()
 6069        log.debug("Config: " + str(config))
 6070        splice_config = config.get("tools", {}).get("splice", {})
 6071        if not splice_config:
 6072            splice_config = DEFAULT_TOOLS_BIN.get("splice", {})
 6073        if not splice_config:
 6074            msg_err = "No Splice tool config"
 6075            log.error(msg_err)
 6076            raise ValueError(msg_err)
 6077        log.debug(f"splice_config={splice_config}")
 6078
 6079        # Config - Folders - Databases
 6080        databases_folders = (
 6081            config.get("folders", {}).get("databases", {}).get("splice", ["."])
 6082        )
 6083        log.debug("Databases annotations: " + str(databases_folders))
 6084
 6085        # Splice docker image
 6086        splice_docker_image = splice_config.get("docker").get("image")
 6087
 6088        # Pull splice image if it's not already there
 6089        if not check_docker_image_exists(splice_docker_image):
 6090            log.warning(
 6091                f"Annotation: splice docker image {splice_docker_image} not found locally, trying to pull from dockerhub"
 6092            )
 6093            try:
 6094                command(f"docker pull {splice_config.get('docker').get('image')}")
 6095            except subprocess.CalledProcessError:
 6096                msg_err = f"Unable to find docker {splice_docker_image} on dockerhub"
 6097                log.error(msg_err)
 6098                raise ValueError(msg_err)
 6099                return None
 6100
 6101        # Config - splice databases
 6102        splice_databases = (
 6103            config.get("folders", {})
 6104            .get("databases", {})
 6105            .get("splice", DEFAULT_SPLICE_FOLDER)
 6106        )
 6107        splice_databases = full_path(splice_databases)
 6108
 6109        # Param
 6110        param = self.get_param()
 6111        log.debug("Param: " + str(param))
 6112
 6113        # Param
 6114        options = param.get("annotation", {}).get("splice", {})
 6115        log.debug("Options: " + str(options))
 6116
 6117        # Data
 6118        table_variants = self.get_table_variants()
 6119
 6120        # Check if not empty
 6121        log.debug("Check if not empty")
 6122        sql_query_chromosomes = (
 6123            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 6124        )
 6125        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
 6126            log.info("VCF empty")
 6127            return None
 6128
 6129        # Export in VCF
 6130        log.debug("Create initial file to annotate")
 6131
 6132        # Create output folder
 6133        output_folder = os.path.join(self.get_tmp_dir(), f"splice-{get_random()}")
 6134        if not os.path.exists(output_folder):
 6135            Path(output_folder).mkdir(parents=True, exist_ok=True)
 6136
 6137        # Create tmp VCF file
 6138        tmp_vcf = NamedTemporaryFile(
 6139            prefix=self.get_prefix(),
 6140            dir=output_folder,
 6141            suffix=".vcf",
 6142            delete=False,
 6143        )
 6144        tmp_vcf_name = tmp_vcf.name
 6145
 6146        # VCF header
 6147        header = self.get_header()
 6148
 6149        # Existing annotations
 6150        for vcf_annotation in self.get_header().infos:
 6151
 6152            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 6153            log.debug(
 6154                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 6155            )
 6156
 6157        # Memory limit
 6158        if config.get("memory", None):
 6159            memory_limit = config.get("memory", "8G").upper()
 6160            # upper()
 6161        else:
 6162            memory_limit = "8G"
 6163        log.debug(f"memory_limit: {memory_limit}")
 6164
 6165        # Check number of variants to annotate
 6166        where_clause_regex_spliceai = r"SpliceAI_\w+"
 6167        where_clause_regex_spip = r"SPiP_\w+"
 6168        where_clause = f""" WHERE NOT regexp_matches("INFO", '{where_clause_regex_spliceai}') AND NOT regexp_matches("INFO", '{where_clause_regex_spip}')"""
 6169        df_list_of_variants_to_annotate = self.get_query_to_df(
 6170            query=f""" SELECT * FROM variants {where_clause} """
 6171        )
 6172        if len(df_list_of_variants_to_annotate) == 0:
 6173            log.warning(
 6174                f"No variants to annotate with splice. Variants probably already annotated with splice"
 6175            )
 6176            return None
 6177        else:
 6178            log.info(f"Annotation: {len(df_list_of_variants_to_annotate)} variants")
 6179
 6180        # Export VCF file
 6181        self.export_variant_vcf(
 6182            vcf_file=tmp_vcf_name,
 6183            remove_info=True,
 6184            add_samples=True,
 6185            index=False,
 6186            where_clause=where_clause,
 6187        )
 6188
 6189        # Create docker container and launch splice analysis
 6190        if splice_config:
 6191
 6192            # Splice mount folders
 6193            mount_folders = splice_config.get("mount", {})
 6194
 6195            # Genome mount
 6196            mount_folders[
 6197                config.get("folders", {})
 6198                .get("databases", {})
 6199                .get("genomes", DEFAULT_GENOME_FOLDER)
 6200            ] = "ro"
 6201
 6202            # SpliceAI mount
 6203            mount_folders[
 6204                config.get("folders", {})
 6205                .get("databases", {})
 6206                .get("spliceai", DEFAULT_SPLICEAI_FOLDER)
 6207            ] = "ro"
 6208
 6209            # Genome mount
 6210            mount_folders[
 6211                config.get("folders", {})
 6212                .get("databases", {})
 6213                .get("spip", DEFAULT_SPIP_FOLDER)
 6214            ] = "ro"
 6215
 6216            # Mount folders
 6217            mount = []
 6218
 6219            # Config mount
 6220            mount = [
 6221                f"-v {full_path(path)}:{full_path(path)}:{mode}"
 6222                for path, mode in mount_folders.items()
 6223            ]
 6224
 6225            if any(value for value in splice_config.values() if value is None):
 6226                log.warning("At least one splice config parameter is empty")
 6227                return None
 6228
 6229            # Params in splice nf
 6230            def check_values(dico: dict):
 6231                """
 6232                Ensure parameters for NF splice pipeline
 6233                """
 6234                for key, val in dico.items():
 6235                    if key == "genome":
 6236                        if any(
 6237                            assemb in options.get("genome", {})
 6238                            for assemb in ["hg19", "GRCh37", "grch37", "GRCH37"]
 6239                        ):
 6240                            yield f"--{key} hg19"
 6241                        elif any(
 6242                            assemb in options.get("genome", {})
 6243                            for assemb in ["hg38", "GRCh38", "grch38", "GRCH38"]
 6244                        ):
 6245                            yield f"--{key} hg38"
 6246                    elif (
 6247                        (isinstance(val, str) and val)
 6248                        or isinstance(val, int)
 6249                        or isinstance(val, bool)
 6250                    ):
 6251                        yield f"--{key} {val}"
 6252
 6253            # Genome
 6254            genome = options.get("genome", config.get("assembly", DEFAULT_ASSEMBLY))
 6255            options["genome"] = genome
 6256
 6257            # NF params
 6258            nf_params = []
 6259
 6260            # Add options
 6261            if options:
 6262                nf_params = list(check_values(options))
 6263                log.debug(f"Splice NF params: {' '.join(nf_params)}")
 6264            else:
 6265                log.debug("No NF params provided")
 6266
 6267            # Add threads
 6268            if "threads" not in options.keys():
 6269                nf_params.append(f"--threads {threads}")
 6270
 6271            # Genome path
 6272            genome_path = find_genome(
 6273                config.get("folders", {})
 6274                .get("databases", {})
 6275                .get("genomes", DEFAULT_GENOME_FOLDER),
 6276                file=f"{genome}.fa",
 6277            )
 6278            # Add genome path
 6279            if not genome_path:
 6280                raise ValueError(
 6281                    f"Can't find genome assembly {genome}.fa in {config.get('folders', {}).get('databases', {}).get('genomes', DEFAULT_GENOME_FOLDER)}"
 6282                )
 6283            else:
 6284                log.debug(f"Genome: {genome_path}")
 6285                nf_params.append(f"--genome_path {genome_path}")
 6286
 6287            def splice_annotations(options: dict = {}, config: dict = {}) -> list:
 6288                """
 6289                Setting up updated databases for SPiP and SpliceAI
 6290                """
 6291
 6292                try:
 6293
 6294                    # SpliceAI assembly transcriptome
 6295                    spliceai_assembly = os.path.join(
 6296                        config.get("folders", {})
 6297                        .get("databases", {})
 6298                        .get("spliceai", {}),
 6299                        options.get("genome"),
 6300                        "transcriptome",
 6301                    )
 6302                    spip_assembly = options.get("genome")
 6303
 6304                    spip = find(
 6305                        f"transcriptome_{spip_assembly}.RData",
 6306                        config.get("folders", {}).get("databases", {}).get("spip", {}),
 6307                    )
 6308                    spliceai = find("spliceai.refseq.txt", spliceai_assembly)
 6309                    log.debug(f"SPiP annotations: {spip}")
 6310                    log.debug(f"SpliceAI annotations: {spliceai}")
 6311                    if spip and spliceai:
 6312                        return [
 6313                            f"--spip_transcriptome {spip}",
 6314                            f"--spliceai_annotations {spliceai}",
 6315                        ]
 6316                    else:
 6317                        # TODO crash and go on with basic annotations ?
 6318                        # raise ValueError(
 6319                        #     "Can't find splice databases in configuration EXIT"
 6320                        # )
 6321                        log.warning(
 6322                            "Can't find splice databases in configuration, use annotations file from image"
 6323                        )
 6324                except TypeError:
 6325                    log.warning(
 6326                        "Can't find splice databases in configuration, use annotations file from image"
 6327                    )
 6328                    return []
 6329
 6330            # Add options, check if transcriptome option have already beend provided
 6331            if (
 6332                "spip_transcriptome" not in nf_params
 6333                and "spliceai_transcriptome" not in nf_params
 6334            ):
 6335                splice_reference = splice_annotations(options, config)
 6336                if splice_reference:
 6337                    nf_params.extend(splice_reference)
 6338
 6339            nf_params.append(f"--output_folder {output_folder}")
 6340
 6341            random_uuid = f"HOWARD-SPLICE-{get_random()}"
 6342            cmd = f"nextflow -log {os.path.join(output_folder, f'{random_uuid}.log')} -c /app/SpliceToolBox/src/splicetoolbox/nextflow/nextflow.docker.config run /app/SpliceToolBox/src/splicetoolbox/nextflow/main.nf -entry SPLICE --vcf {tmp_vcf_name} {' '.join(nf_params)} -profile standard,conda,singularity,report,timeline"
 6343            log.debug(cmd)
 6344
 6345            splice_config["docker"]["command"] = cmd
 6346
 6347            docker_cmd = get_bin_command(
 6348                tool="splice",
 6349                bin_type="docker",
 6350                config=config,
 6351                default_folder=f"{DEFAULT_TOOLS_FOLDER}/docker",
 6352                add_options=f"--name {random_uuid} {' '.join(mount)}",
 6353            )
 6354
 6355            # Docker debug
 6356            # if splice_config.get("rm_container"):
 6357            #     rm_container = "--rm"
 6358            # else:
 6359            #     rm_container = ""
 6360            # docker_cmd = f"docker run {rm_container} --entrypoint '/bin/bash' --name {random_uuid} {' '.join(mount)} {':'.join(splice_config.get('image'))} {cmd}"
 6361
 6362            log.debug(docker_cmd)
 6363            res = subprocess.run(docker_cmd, shell=True, capture_output=True, text=True)
 6364            log.debug(res.stdout)
 6365            if res.stderr:
 6366                log.error(res.stderr)
 6367            res.check_returncode()
 6368        else:
 6369            log.warning(f"Splice tool configuration not found: {config}")
 6370
 6371        # Update variants
 6372        log.info("Annotation - Updating...")
 6373        # Test find output vcf
 6374        log.debug(
 6375            f"TMP splice output: {os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz"
 6376        )
 6377        output_vcf = []
 6378        # Wrong folder to look in
 6379        for files in os.listdir(os.path.dirname(tmp_vcf_name)):
 6380            if (
 6381                files
 6382                == f"{os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz"
 6383            ):
 6384                output_vcf.append(os.path.join(os.path.dirname(tmp_vcf_name), files))
 6385        # log.debug(os.listdir(options.get("output_folder")))
 6386        log.debug(f"Splice annotated vcf: {output_vcf[0]}")
 6387        if not output_vcf:
 6388            log.debug(
 6389                f"Splice output was not generated {os.path.basename(tmp_vcf_name)}*.spip.spliceai.sorted.vcf.gz"
 6390            )
 6391        else:
 6392            # Get new header from annotated vcf
 6393            log.debug(f"Initial header: {len(header.infos)} fields")
 6394            # Create new header with splice infos
 6395            new_vcf = Variants(input=output_vcf[0])
 6396            new_vcf_header = new_vcf.get_header().infos
 6397            for keys, infos in new_vcf_header.items():
 6398                if keys not in header.infos.keys():
 6399                    header.infos[keys] = infos
 6400            log.debug(f"New header: {len(header.infos)} fields")
 6401            log.debug(f"Splice tmp output: {output_vcf[0]}")
 6402            self.update_from_vcf(output_vcf[0])
 6403
 6404        # Remove folder
 6405        remove_if_exists(output_folder)
 6406
 6407    ###
 6408    # Prioritization
 6409    ###
 6410
 6411    def get_config_default(self, name: str) -> dict:
 6412        """
 6413        The function `get_config_default` returns a dictionary containing default configurations for
 6414        various calculations and prioritizations.
 6415
 6416        :param name: The `get_config_default` function returns a dictionary containing default
 6417        configurations for different calculations and prioritizations. The `name` parameter is used to
 6418        specify which specific configuration to retrieve from the dictionary
 6419        :type name: str
 6420        :return: The function `get_config_default` returns a dictionary containing default configuration
 6421        settings for different calculations and prioritizations. The specific configuration settings are
 6422        retrieved based on the input `name` parameter provided to the function. If the `name` parameter
 6423        matches a key in the `config_default` dictionary, the corresponding configuration settings are
 6424        returned. If there is no match, an empty dictionary is returned.
 6425        """
 6426
 6427        config_default = {
 6428            "calculations": {
 6429                "variant_chr_pos_alt_ref": {
 6430                    "type": "sql",
 6431                    "name": "variant_chr_pos_alt_ref",
 6432                    "description": "Create a variant ID with chromosome, position, alt and ref",
 6433                    "available": False,
 6434                    "output_column_name": "variant_chr_pos_alt_ref",
 6435                    "output_column_type": "String",
 6436                    "output_column_description": "variant ID with chromosome, position, alt and ref",
 6437                    "operation_query": """ concat("#CHROM", '_', "POS", '_', "REF", '_', "ALT") """,
 6438                    "operation_info": True,
 6439                },
 6440                "VARTYPE": {
 6441                    "type": "sql",
 6442                    "name": "VARTYPE",
 6443                    "description": "Variant type (e.g. SNV, INDEL, MNV, BND...)",
 6444                    "available": True,
 6445                    "output_column_name": "VARTYPE",
 6446                    "output_column_type": "String",
 6447                    "output_column_description": "Variant type: SNV if X>Y, MOSAIC if X>Y,Z or X,Y>Z, INDEL if XY>Z or X>YZ",
 6448                    "operation_query": """
 6449                            CASE
 6450                                WHEN "SVTYPE" NOT NULL THEN "SVTYPE"
 6451                                WHEN LENGTH(REF) = 1 AND LENGTH(ALT) = 1 THEN 'SNV'
 6452                                WHEN REF LIKE '%,%' OR ALT LIKE '%,%' THEN 'MOSAIC'
 6453                                WHEN LENGTH(REF) == LENGTH(ALT) AND LENGTH(REF) > 1 THEN 'MNV'
 6454                                WHEN LENGTH(REF) <> LENGTH(ALT) THEN 'INDEL'
 6455                                ELSE 'UNDEFINED'
 6456                            END
 6457                            """,
 6458                    "info_fields": ["SVTYPE"],
 6459                    "operation_info": True,
 6460                },
 6461                "snpeff_hgvs": {
 6462                    "type": "python",
 6463                    "name": "snpeff_hgvs",
 6464                    "description": "HGVS nomenclatures from snpEff annotation",
 6465                    "available": True,
 6466                    "function_name": "calculation_extract_snpeff_hgvs",
 6467                    "function_params": ["snpeff_hgvs", "ANN"],
 6468                },
 6469                "snpeff_ann_explode": {
 6470                    "type": "python",
 6471                    "name": "snpeff_ann_explode",
 6472                    "description": "Explode snpEff annotations with uniquify values",
 6473                    "available": True,
 6474                    "function_name": "calculation_snpeff_ann_explode",
 6475                    "function_params": [False, "fields", "snpeff_", "ANN"],
 6476                },
 6477                "snpeff_ann_explode_uniquify": {
 6478                    "type": "python",
 6479                    "name": "snpeff_ann_explode_uniquify",
 6480                    "description": "Explode snpEff annotations",
 6481                    "available": True,
 6482                    "function_name": "calculation_snpeff_ann_explode",
 6483                    "function_params": [True, "fields", "snpeff_uniquify_", "ANN"],
 6484                },
 6485                "snpeff_ann_explode_json": {
 6486                    "type": "python",
 6487                    "name": "snpeff_ann_explode_json",
 6488                    "description": "Explode snpEff annotations in JSON format",
 6489                    "available": True,
 6490                    "function_name": "calculation_snpeff_ann_explode",
 6491                    "function_params": [False, "JSON", "snpeff_json", "ANN"],
 6492                },
 6493                "NOMEN": {
 6494                    "type": "python",
 6495                    "name": "NOMEN",
 6496                    "description": "NOMEN information (e.g. NOMEN, CNOMEN, PNOMEN...) from HGVS nomenclature field",
 6497                    "available": True,
 6498                    "function_name": "calculation_extract_nomen",
 6499                    "function_params": [],
 6500                },
 6501                "FINDBYPIPELINE": {
 6502                    "type": "python",
 6503                    "name": "FINDBYPIPELINE",
 6504                    "description": "Number of pipeline that identify the variant (for multi pipeline VCF)",
 6505                    "available": True,
 6506                    "function_name": "calculation_find_by_pipeline",
 6507                    "function_params": ["findbypipeline"],
 6508                },
 6509                "FINDBYSAMPLE": {
 6510                    "type": "python",
 6511                    "name": "FINDBYSAMPLE",
 6512                    "description": "Number of sample that have a genotype for the variant (for multi sample VCF)",
 6513                    "available": True,
 6514                    "function_name": "calculation_find_by_pipeline",
 6515                    "function_params": ["findbysample"],
 6516                },
 6517                "GENOTYPECONCORDANCE": {
 6518                    "type": "python",
 6519                    "name": "GENOTYPECONCORDANCE",
 6520                    "description": "Concordance of genotype for multi caller VCF",
 6521                    "available": True,
 6522                    "function_name": "calculation_genotype_concordance",
 6523                    "function_params": [],
 6524                },
 6525                "BARCODE": {
 6526                    "type": "python",
 6527                    "name": "BARCODE",
 6528                    "description": "BARCODE as VaRank tool",
 6529                    "available": True,
 6530                    "function_name": "calculation_barcode",
 6531                    "function_params": [],
 6532                },
 6533                "BARCODEFAMILY": {
 6534                    "type": "python",
 6535                    "name": "BARCODEFAMILY",
 6536                    "description": "BARCODEFAMILY as VaRank tool",
 6537                    "available": True,
 6538                    "function_name": "calculation_barcode_family",
 6539                    "function_params": ["BCF"],
 6540                },
 6541                "TRIO": {
 6542                    "type": "python",
 6543                    "name": "TRIO",
 6544                    "description": "Inheritance for a trio family",
 6545                    "available": True,
 6546                    "function_name": "calculation_trio",
 6547                    "function_params": [],
 6548                },
 6549                "VAF": {
 6550                    "type": "python",
 6551                    "name": "VAF",
 6552                    "description": "Variant Allele Frequency (VAF) harmonization",
 6553                    "available": True,
 6554                    "function_name": "calculation_vaf_normalization",
 6555                    "function_params": [],
 6556                },
 6557                "VAF_stats": {
 6558                    "type": "python",
 6559                    "name": "VAF_stats",
 6560                    "description": "Variant Allele Frequency (VAF) statistics",
 6561                    "available": True,
 6562                    "function_name": "calculation_genotype_stats",
 6563                    "function_params": ["VAF"],
 6564                },
 6565                "DP_stats": {
 6566                    "type": "python",
 6567                    "name": "DP_stats",
 6568                    "description": "Depth (DP) statistics",
 6569                    "available": True,
 6570                    "function_name": "calculation_genotype_stats",
 6571                    "function_params": ["DP"],
 6572                },
 6573                "variant_id": {
 6574                    "type": "python",
 6575                    "name": "variant_id",
 6576                    "description": "Variant ID generated from variant position and type",
 6577                    "available": True,
 6578                    "function_name": "calculation_variant_id",
 6579                    "function_params": [],
 6580                },
 6581                "transcripts_json": {
 6582                    "type": "python",
 6583                    "name": "transcripts_json",
 6584                    "description": "Add transcripts annotations in JSON format (field 'transcripts_json')",
 6585                    "available": True,
 6586                    "function_name": "calculation_transcripts_annotation",
 6587                    "function_params": ["transcripts_json", None],
 6588                },
 6589                "transcripts_ann": {
 6590                    "type": "python",
 6591                    "name": "transcripts_ann",
 6592                    "description": "Add transcripts annotations in structured format (field 'transcripts_ann')",
 6593                    "available": True,
 6594                    "function_name": "calculation_transcripts_annotation",
 6595                    "function_params": [None, "transcripts_ann"],
 6596                },
 6597                "transcripts_annotations": {
 6598                    "type": "python",
 6599                    "name": "transcripts_annotations",
 6600                    "description": "Add transcripts annotations in JSON and/or structured format (see param JSON file)",
 6601                    "available": True,
 6602                    "function_name": "calculation_transcripts_annotation",
 6603                    "function_params": [None, None],
 6604                },
 6605                "transcripts_prioritization": {
 6606                    "type": "python",
 6607                    "name": "transcripts_prioritization",
 6608                    "description": "Prioritize transcripts with a prioritization profile (using param.json)",
 6609                    "available": True,
 6610                    "function_name": "calculation_transcripts_prioritization",
 6611                    "function_params": [],
 6612                },
 6613            },
 6614            "prioritizations": {
 6615                "default": {
 6616                    "filter": [
 6617                        {
 6618                            "type": "notequals",
 6619                            "value": "!PASS|\\.",
 6620                            "score": 0,
 6621                            "flag": "FILTERED",
 6622                            "comment": ["Bad variant quality"],
 6623                        },
 6624                        {
 6625                            "type": "equals",
 6626                            "value": "REJECT",
 6627                            "score": -20,
 6628                            "flag": "PASS",
 6629                            "comment": ["Bad variant quality"],
 6630                        },
 6631                    ],
 6632                    "DP": [
 6633                        {
 6634                            "type": "gte",
 6635                            "value": "50",
 6636                            "score": 5,
 6637                            "flag": "PASS",
 6638                            "comment": ["DP higher than 50"],
 6639                        }
 6640                    ],
 6641                    "ANN": [
 6642                        {
 6643                            "type": "contains",
 6644                            "value": "HIGH",
 6645                            "score": 5,
 6646                            "flag": "PASS",
 6647                            "comment": [
 6648                                "The variant is assumed to have high (disruptive) impact in the protein, probably causing protein truncation, loss of function or triggering nonsense mediated decay"
 6649                            ],
 6650                        },
 6651                        {
 6652                            "type": "contains",
 6653                            "value": "MODERATE",
 6654                            "score": 3,
 6655                            "flag": "PASS",
 6656                            "comment": [
 6657                                "A non-disruptive variant that might change protein effectiveness"
 6658                            ],
 6659                        },
 6660                        {
 6661                            "type": "contains",
 6662                            "value": "LOW",
 6663                            "score": 0,
 6664                            "flag": "FILTERED",
 6665                            "comment": [
 6666                                "Assumed to be mostly harmless or unlikely to change protein behavior"
 6667                            ],
 6668                        },
 6669                        {
 6670                            "type": "contains",
 6671                            "value": "MODIFIER",
 6672                            "score": 0,
 6673                            "flag": "FILTERED",
 6674                            "comment": [
 6675                                "Usually non-coding variants or variants affecting non-coding genes, where predictions are difficult or there is no evidence of impact"
 6676                            ],
 6677                        },
 6678                    ],
 6679                }
 6680            },
 6681        }
 6682
 6683        return config_default.get(name, None)
 6684
 6685    def get_config_json(
 6686        self, name: str, config_dict: dict = {}, config_file: str = None
 6687    ) -> dict:
 6688        """
 6689        The function `get_config_json` retrieves a configuration JSON object with prioritizations from
 6690        default values, a dictionary, and a file.
 6691
 6692        :param name: The `name` parameter in the `get_config_json` function is a string that represents
 6693        the name of the configuration. It is used to identify and retrieve the configuration settings
 6694        for a specific component or module
 6695        :type name: str
 6696        :param config_dict: The `config_dict` parameter in the `get_config_json` function is a
 6697        dictionary that allows you to provide additional configuration settings or overrides. When you
 6698        call the `get_config_json` function, you can pass a dictionary containing key-value pairs where
 6699        the key is the configuration setting you want to override or
 6700        :type config_dict: dict
 6701        :param config_file: The `config_file` parameter in the `get_config_json` function is used to
 6702        specify the path to a configuration file that contains additional settings. If provided, the
 6703        function will read the contents of this file and update the configuration dictionary with the
 6704        values found in the file, overriding any existing values with the
 6705        :type config_file: str
 6706        :return: The function `get_config_json` returns a dictionary containing the configuration
 6707        settings.
 6708        """
 6709
 6710        # Create with default prioritizations
 6711        config_default = self.get_config_default(name=name)
 6712        configuration = config_default
 6713        # log.debug(f"configuration={configuration}")
 6714
 6715        # Replace prioritizations from dict
 6716        for config in config_dict:
 6717            configuration[config] = config_dict[config]
 6718
 6719        # Replace prioritizations from file
 6720        config_file = full_path(config_file)
 6721        if config_file:
 6722            if os.path.exists(config_file):
 6723                with open(config_file) as config_file_content:
 6724                    config_file_dict = json.load(config_file_content)
 6725                for config in config_file_dict:
 6726                    configuration[config] = config_file_dict[config]
 6727            else:
 6728                msg_error = f"Config '{name}' file '{config_file}' does NOT exist"
 6729                log.error(msg_error)
 6730                raise ValueError(msg_error)
 6731
 6732        return configuration
 6733
 6734    def prioritization(
 6735        self, table: str = None, pz_prefix: str = None, pz_param: dict = None
 6736    ) -> bool:
 6737        """
 6738        The `prioritization` function in Python processes VCF files, adds new INFO fields, and
 6739        prioritizes variants based on configured profiles and criteria.
 6740
 6741        :param table: The `table` parameter in the `prioritization` function is used to specify the name
 6742        of the table (presumably a VCF file) on which the prioritization operation will be performed. If
 6743        a table name is provided, the method will prioritize the variants in that specific table
 6744        :type table: str
 6745        :param pz_prefix: The `pz_prefix` parameter is used to specify a prefix that will be added to
 6746        certain INFO fields in a VCF file during the prioritization process. If this parameter is not
 6747        provided, the code will use a default prefix value of "PZ"
 6748        :type pz_prefix: str
 6749        :param pz_param: The `pz_param` parameter in the `prioritization` method is used to pass
 6750        additional parameters specific to the prioritization process. These parameters can include
 6751        settings related to prioritization profiles, fields, scoring modes, flags, comments, and other
 6752        configurations needed for the prioritization of variants in a V
 6753        :type pz_param: dict
 6754        :return: A boolean value (True) is being returned from the `prioritization` function.
 6755        """
 6756
 6757        # Config
 6758        config = self.get_config()
 6759
 6760        # Param
 6761        param = self.get_param()
 6762
 6763        # Prioritization param
 6764        if pz_param is not None:
 6765            prioritization_param = pz_param
 6766        else:
 6767            prioritization_param = param.get("prioritization", {})
 6768
 6769        # Configuration profiles
 6770        prioritization_config_file = prioritization_param.get(
 6771            "prioritization_config", None
 6772        )
 6773        prioritization_config_file = full_path(prioritization_config_file)
 6774        prioritizations_config = self.get_config_json(
 6775            name="prioritizations", config_file=prioritization_config_file
 6776        )
 6777
 6778        # Prioritization prefix
 6779        pz_prefix_default = "PZ"
 6780        if pz_prefix is None:
 6781            pz_prefix = prioritization_param.get("pzprefix", pz_prefix_default)
 6782
 6783        # Prioritization options
 6784        profiles = prioritization_param.get("profiles", [])
 6785        if isinstance(profiles, str):
 6786            profiles = profiles.split(",")
 6787        pzfields = prioritization_param.get(
 6788            "pzfields", [f"{pz_prefix}Flag", f"{pz_prefix}Score"]
 6789        )
 6790        if isinstance(pzfields, str):
 6791            pzfields = pzfields.split(",")
 6792        default_profile = prioritization_param.get("default_profile", None)
 6793        pzfields_sep = prioritization_param.get("pzfields_sep", "_")
 6794        prioritization_score_mode = prioritization_param.get(
 6795            "prioritization_score_mode", "HOWARD"
 6796        )
 6797
 6798        # Quick Prioritizations
 6799        prioritizations = param.get("prioritizations", None)
 6800        if prioritizations:
 6801            log.info("Quick Prioritization:")
 6802            for profile in prioritizations.split(","):
 6803                if profile not in profiles:
 6804                    profiles.append(profile)
 6805                    log.info(f"   {profile}")
 6806
 6807        # If profile "ALL" provided, all profiles in the config profiles
 6808        if "ALL" in profiles:
 6809            profiles = list(prioritizations_config.keys())
 6810
 6811        for profile in profiles:
 6812            if prioritizations_config.get(profile, None):
 6813                log.debug(f"Profile '{profile}' configured")
 6814            else:
 6815                msg_error = f"Profile '{profile}' NOT configured"
 6816                log.error(msg_error)
 6817                raise ValueError(msg_error)
 6818
 6819        if profiles:
 6820            log.info(f"Prioritization... ")
 6821        else:
 6822            log.debug(f"No profile defined")
 6823            return False
 6824
 6825        if not default_profile and len(profiles):
 6826            default_profile = profiles[0]
 6827
 6828        log.debug("Profiles availables: " + str(list(prioritizations_config.keys())))
 6829        log.debug("Profiles to check: " + str(list(profiles)))
 6830
 6831        # Variables
 6832        if table is not None:
 6833            table_variants = table
 6834        else:
 6835            table_variants = self.get_table_variants(clause="update")
 6836        log.debug(f"Table to prioritize: {table_variants}")
 6837
 6838        # Added columns
 6839        added_columns = []
 6840
 6841        # Create list of PZfields
 6842        # List of PZFields
 6843        list_of_pzfields_original = pzfields + [
 6844            pzfield + pzfields_sep + profile
 6845            for pzfield in pzfields
 6846            for profile in profiles
 6847        ]
 6848        list_of_pzfields = []
 6849        log.debug(f"{list_of_pzfields_original}")
 6850
 6851        # Remove existing PZfields to use if exists
 6852        for pzfield in list_of_pzfields_original:
 6853            if self.get_header().infos.get(pzfield, None) is None:
 6854                list_of_pzfields.append(pzfield)
 6855                log.debug(f"VCF Input - Header - PZfield '{pzfield}' not in VCF")
 6856            else:
 6857                log.debug(f"VCF Input - Header - PZfield '{pzfield}' already in VCF")
 6858
 6859        if list_of_pzfields:
 6860
 6861            # Explode Infos prefix
 6862            explode_infos_prefix = self.get_explode_infos_prefix()
 6863
 6864            # PZfields tags description
 6865            PZfields_INFOS = {
 6866                f"{pz_prefix}Tags": {
 6867                    "ID": f"{pz_prefix}Tags",
 6868                    "Number": ".",
 6869                    "Type": "String",
 6870                    "Description": "Variant tags based on annotation criteria",
 6871                },
 6872                f"{pz_prefix}Score": {
 6873                    "ID": f"{pz_prefix}Score",
 6874                    "Number": 1,
 6875                    "Type": "Integer",
 6876                    "Description": "Variant score based on annotation criteria",
 6877                },
 6878                f"{pz_prefix}Flag": {
 6879                    "ID": f"{pz_prefix}Flag",
 6880                    "Number": 1,
 6881                    "Type": "String",
 6882                    "Description": "Variant flag based on annotation criteria",
 6883                },
 6884                f"{pz_prefix}Comment": {
 6885                    "ID": f"{pz_prefix}Comment",
 6886                    "Number": ".",
 6887                    "Type": "String",
 6888                    "Description": "Variant comment based on annotation criteria",
 6889                },
 6890                f"{pz_prefix}Infos": {
 6891                    "ID": f"{pz_prefix}Infos",
 6892                    "Number": ".",
 6893                    "Type": "String",
 6894                    "Description": "Variant infos based on annotation criteria",
 6895                },
 6896            }
 6897
 6898            # Create INFO fields if not exist
 6899            for field in PZfields_INFOS:
 6900                field_ID = PZfields_INFOS[field]["ID"]
 6901                field_description = PZfields_INFOS[field]["Description"]
 6902                if field_ID not in self.get_header().infos and field_ID in pzfields:
 6903                    field_description = (
 6904                        PZfields_INFOS[field]["Description"]
 6905                        + f", profile {default_profile}"
 6906                    )
 6907                    self.get_header().infos[field_ID] = vcf.parser._Info(
 6908                        field_ID,
 6909                        PZfields_INFOS[field]["Number"],
 6910                        PZfields_INFOS[field]["Type"],
 6911                        field_description,
 6912                        "unknown",
 6913                        "unknown",
 6914                        code_type_map[PZfields_INFOS[field]["Type"]],
 6915                    )
 6916
 6917            # Create INFO fields if not exist for each profile
 6918            for profile in prioritizations_config:
 6919                if profile in profiles or profiles == []:
 6920                    for field in PZfields_INFOS:
 6921                        field_ID = PZfields_INFOS[field]["ID"] + pzfields_sep + profile
 6922                        field_description = (
 6923                            PZfields_INFOS[field]["Description"]
 6924                            + f", profile {profile}"
 6925                        )
 6926                        if (
 6927                            field_ID not in self.get_header().infos
 6928                            and field in pzfields
 6929                        ):
 6930                            self.get_header().infos[field_ID] = vcf.parser._Info(
 6931                                field_ID,
 6932                                PZfields_INFOS[field]["Number"],
 6933                                PZfields_INFOS[field]["Type"],
 6934                                field_description,
 6935                                "unknown",
 6936                                "unknown",
 6937                                code_type_map[PZfields_INFOS[field]["Type"]],
 6938                            )
 6939
 6940            # Header
 6941            for pzfield in list_of_pzfields:
 6942                if re.match(f"{pz_prefix}Score.*", pzfield):
 6943                    added_column = self.add_column(
 6944                        table_name=table_variants,
 6945                        column_name=pzfield,
 6946                        column_type="INTEGER",
 6947                        default_value="0",
 6948                    )
 6949                elif re.match(f"{pz_prefix}Flag.*", pzfield):
 6950                    added_column = self.add_column(
 6951                        table_name=table_variants,
 6952                        column_name=pzfield,
 6953                        column_type="BOOLEAN",
 6954                        default_value="1",
 6955                    )
 6956                else:
 6957                    added_column = self.add_column(
 6958                        table_name=table_variants,
 6959                        column_name=pzfield,
 6960                        column_type="STRING",
 6961                        default_value="''",
 6962                    )
 6963                added_columns.append(added_column)
 6964
 6965            # Profiles
 6966            if profiles:
 6967
 6968                # foreach profile in configuration file
 6969                for profile in prioritizations_config:
 6970
 6971                    # If profile is asked in param, or ALL are asked (empty profile [])
 6972                    if profile in profiles or profiles == []:
 6973                        log.info(f"Profile '{profile}'")
 6974
 6975                        sql_set_info_option = ""
 6976
 6977                        sql_set_info = []
 6978
 6979                        # PZ fields set
 6980
 6981                        # PZScore
 6982                        if (
 6983                            f"{pz_prefix}Score{pzfields_sep}{profile}"
 6984                            in list_of_pzfields
 6985                        ):
 6986                            sql_set_info.append(
 6987                                f"""
 6988                                    concat(
 6989                                        '{pz_prefix}Score{pzfields_sep}{profile}=',
 6990                                        {pz_prefix}Score{pzfields_sep}{profile}
 6991                                    ) 
 6992                                """
 6993                            )
 6994                            if (
 6995                                profile == default_profile
 6996                                and f"{pz_prefix}Score" in list_of_pzfields
 6997                            ):
 6998                                sql_set_info.append(
 6999                                    f"""
 7000                                        concat(
 7001                                            '{pz_prefix}Score=',
 7002                                            {pz_prefix}Score{pzfields_sep}{profile}
 7003                                        )
 7004                                    """
 7005                                )
 7006
 7007                        # PZFlag
 7008                        if (
 7009                            f"{pz_prefix}Flag{pzfields_sep}{profile}"
 7010                            in list_of_pzfields
 7011                        ):
 7012                            sql_set_info.append(
 7013                                f"""
 7014                                    concat(
 7015                                        '{pz_prefix}Flag{pzfields_sep}{profile}=',
 7016                                        CASE 
 7017                                            WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1
 7018                                            THEN 'PASS'
 7019                                            WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0
 7020                                            THEN 'FILTERED'
 7021                                        END
 7022                                    ) 
 7023                                """
 7024                            )
 7025                            if (
 7026                                profile == default_profile
 7027                                and f"{pz_prefix}Flag" in list_of_pzfields
 7028                            ):
 7029                                sql_set_info.append(
 7030                                    f"""
 7031                                        concat(
 7032                                            '{pz_prefix}Flag=',
 7033                                            CASE 
 7034                                                WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1
 7035                                                THEN 'PASS'
 7036                                                WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0
 7037                                                THEN 'FILTERED'
 7038                                            END
 7039                                        )
 7040                                    """
 7041                                )
 7042
 7043                        # PZComment
 7044                        if (
 7045                            f"{pz_prefix}Comment{pzfields_sep}{profile}"
 7046                            in list_of_pzfields
 7047                        ):
 7048                            sql_set_info.append(
 7049                                f"""
 7050                                    CASE
 7051                                        WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('')
 7052                                        THEN concat('{pz_prefix}Comment{pzfields_sep}{profile}=', {pz_prefix}Comment{pzfields_sep}{profile})
 7053                                        ELSE ''
 7054                                    END
 7055                                """
 7056                            )
 7057                            if (
 7058                                profile == default_profile
 7059                                and f"{pz_prefix}Comment" in list_of_pzfields
 7060                            ):
 7061                                sql_set_info.append(
 7062                                    f"""
 7063                                        CASE
 7064                                            WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('')
 7065                                            THEN concat('{pz_prefix}Comment=', {pz_prefix}Comment{pzfields_sep}{profile})
 7066                                            ELSE ''
 7067                                        END
 7068                                    """
 7069                                )
 7070
 7071                        # PZInfos
 7072                        if (
 7073                            f"{pz_prefix}Infos{pzfields_sep}{profile}"
 7074                            in list_of_pzfields
 7075                        ):
 7076                            sql_set_info.append(
 7077                                f"""
 7078                                    CASE
 7079                                        WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('')
 7080                                        THEN concat('{pz_prefix}Infos{pzfields_sep}{profile}=', {pz_prefix}Infos{pzfields_sep}{profile})
 7081                                        ELSE ''
 7082                                    END
 7083                                """
 7084                            )
 7085                            if (
 7086                                profile == default_profile
 7087                                and f"{pz_prefix}Infos" in list_of_pzfields
 7088                            ):
 7089                                sql_set_info.append(
 7090                                    f"""
 7091                                        CASE
 7092                                            WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('')
 7093                                            THEN concat('{pz_prefix}Infos=', {pz_prefix}Infos{pzfields_sep}{profile})
 7094                                            ELSE ''
 7095                                        END
 7096                                    """
 7097                                )
 7098
 7099                        # Merge PZfields
 7100                        sql_set_info_option = ""
 7101                        sql_set_sep = ""
 7102                        for sql_set in sql_set_info:
 7103                            if sql_set_sep:
 7104                                sql_set_info_option += f"""
 7105                                    , concat('{sql_set_sep}', {sql_set})
 7106                                """
 7107                            else:
 7108                                sql_set_info_option += f"""
 7109                                    , {sql_set}
 7110                                """
 7111                            sql_set_sep = ";"
 7112
 7113                        sql_queries = []
 7114                        for annotation in prioritizations_config[profile]:
 7115
 7116                            # Explode specific annotation
 7117                            log.debug(f"Explode annotation '{annotation}'")
 7118                            added_columns += self.explode_infos(
 7119                                prefix=explode_infos_prefix,
 7120                                fields=[annotation],
 7121                                table=table_variants,
 7122                            )
 7123                            extra_infos = self.get_extra_infos(table=table_variants)
 7124
 7125                            # Check if annotation field is present
 7126                            if not f"{explode_infos_prefix}{annotation}" in extra_infos:
 7127                                log.debug(f"Annotation '{annotation}' not in data")
 7128                                continue
 7129                            else:
 7130                                log.debug(f"Annotation '{annotation}' in data")
 7131
 7132                            # For each criterions
 7133                            for criterion in prioritizations_config[profile][
 7134                                annotation
 7135                            ]:
 7136                                criterion_type = criterion["type"]
 7137                                criterion_value = criterion["value"]
 7138                                criterion_score = criterion.get("score", 0)
 7139                                criterion_flag = criterion.get("flag", "PASS")
 7140                                criterion_flag_bool = criterion_flag == "PASS"
 7141                                criterion_comment = (
 7142                                    ", ".join(criterion.get("comment", []))
 7143                                    .replace("'", "''")
 7144                                    .replace(";", ",")
 7145                                    .replace("\t", " ")
 7146                                )
 7147                                criterion_infos = (
 7148                                    str(criterion)
 7149                                    .replace("'", "''")
 7150                                    .replace(";", ",")
 7151                                    .replace("\t", " ")
 7152                                )
 7153
 7154                                sql_set = []
 7155                                sql_set_info = []
 7156
 7157                                # PZ fields set
 7158                                if (
 7159                                    f"{pz_prefix}Score{pzfields_sep}{profile}"
 7160                                    in list_of_pzfields
 7161                                ):
 7162                                    if prioritization_score_mode == "HOWARD":
 7163                                        sql_set.append(
 7164                                            f"{pz_prefix}Score{pzfields_sep}{profile} = {pz_prefix}Score{pzfields_sep}{profile} + {criterion_score}"
 7165                                        )
 7166                                    elif prioritization_score_mode == "VaRank":
 7167                                        sql_set.append(
 7168                                            f"{pz_prefix}Score{pzfields_sep}{profile} = CASE WHEN {criterion_score}>{pz_prefix}Score{pzfields_sep}{profile} THEN {criterion_score} END"
 7169                                        )
 7170                                    else:
 7171                                        sql_set.append(
 7172                                            f"{pz_prefix}Score{pzfields_sep}{profile} = {pz_prefix}Score{pzfields_sep}{profile} + {criterion_score}"
 7173                                        )
 7174                                if (
 7175                                    f"{pz_prefix}Flag{pzfields_sep}{profile}"
 7176                                    in list_of_pzfields
 7177                                ):
 7178                                    sql_set.append(
 7179                                        f"{pz_prefix}Flag{pzfields_sep}{profile} = {pz_prefix}Flag{pzfields_sep}{profile} AND {criterion_flag_bool}"
 7180                                    )
 7181                                if (
 7182                                    f"{pz_prefix}Comment{pzfields_sep}{profile}"
 7183                                    in list_of_pzfields
 7184                                ):
 7185                                    sql_set.append(
 7186                                        f"""
 7187                                            {pz_prefix}Comment{pzfields_sep}{profile} = 
 7188                                                concat(
 7189                                                    {pz_prefix}Comment{pzfields_sep}{profile},
 7190                                                    CASE 
 7191                                                        WHEN {pz_prefix}Comment{pzfields_sep}{profile}!=''
 7192                                                        THEN ', '
 7193                                                        ELSE ''
 7194                                                    END,
 7195                                                    '{criterion_comment}'
 7196                                                )
 7197                                        """
 7198                                    )
 7199                                if (
 7200                                    f"{pz_prefix}Infos{pzfields_sep}{profile}"
 7201                                    in list_of_pzfields
 7202                                ):
 7203                                    sql_set.append(
 7204                                        f"""
 7205                                            {pz_prefix}Infos{pzfields_sep}{profile} = 
 7206                                                concat(
 7207                                                    {pz_prefix}Infos{pzfields_sep}{profile},
 7208                                                    '{criterion_infos}'
 7209                                                )
 7210                                        """
 7211                                    )
 7212                                sql_set_option = ",".join(sql_set)
 7213
 7214                                # Criterion and comparison
 7215                                if sql_set_option:
 7216                                    try:
 7217                                        float(criterion_value)
 7218                                        sql_update = f"""
 7219                                            UPDATE {table_variants}
 7220                                            SET {sql_set_option}
 7221                                            WHERE CAST("{explode_infos_prefix}{annotation}" AS VARCHAR) NOT IN ('','.')
 7222                                            AND CAST("{explode_infos_prefix}{annotation}" AS FLOAT){comparison_map[criterion_type]}{criterion_value}
 7223                                            """
 7224                                    except:
 7225                                        contains_option = ""
 7226                                        if criterion_type == "contains":
 7227                                            contains_option = ".*"
 7228                                        sql_update = f"""
 7229                                            UPDATE {table_variants}
 7230                                            SET {sql_set_option}
 7231                                            WHERE "{explode_infos_prefix}{annotation}" SIMILAR TO '{contains_option}{criterion_value}{contains_option}'
 7232                                            """
 7233                                    sql_queries.append(sql_update)
 7234                                else:
 7235                                    log.warning(
 7236                                        f"NO SQL SET option for '{annotation}' - '{criterion}'"
 7237                                    )
 7238
 7239                        # PZTags
 7240                        if (
 7241                            f"{pz_prefix}Tags{pzfields_sep}{profile}"
 7242                            in list_of_pzfields
 7243                        ):
 7244
                            # Create PZTags value
 7246                            pztags_value = ""
 7247                            pztags_sep_default = "|"
 7248                            pztags_sep = ""
 7249                            for pzfield in pzfields:
 7250                                if pzfield not in [f"{pz_prefix}Tags"]:
 7251                                    if (
 7252                                        f"{pzfield}{pzfields_sep}{profile}"
 7253                                        in list_of_pzfields
 7254                                    ):
 7255                                        if pzfield in [f"{pz_prefix}Flag"]:
 7256                                            pztags_value += f"""{pztags_sep}{pzfield}#', 
 7257                                                CASE WHEN {pz_prefix}Flag{pzfields_sep}{profile}
 7258                                                    THEN 'PASS'
 7259                                                    ELSE 'FILTERED'
 7260                                                END, '"""
 7261                                        else:
 7262                                            pztags_value += f"{pztags_sep}{pzfield}#', {pzfield}{pzfields_sep}{profile}, '"
 7263                                        pztags_sep = pztags_sep_default
 7264
                            # Add query update for PZTags
 7266                            sql_update_pztags = f"""
 7267                                UPDATE {table_variants}
 7268                                SET INFO = concat(
 7269                                        INFO,
 7270                                        CASE WHEN INFO NOT in ('','.')
 7271                                                THEN ';'
 7272                                                ELSE ''
 7273                                        END,
 7274                                        '{pz_prefix}Tags{pzfields_sep}{profile}={pztags_value}'
 7275                                    )
 7276                                """
 7277                            sql_queries.append(sql_update_pztags)
 7278
                            # Add query update for PZTags for the default profile
 7280                            if profile == default_profile:
 7281                                sql_update_pztags_default = f"""
 7282                                UPDATE {table_variants}
 7283                                SET INFO = concat(
 7284                                        INFO,
 7285                                        ';',
 7286                                        '{pz_prefix}Tags={pztags_value}'
 7287                                    )
 7288                                """
 7289                                sql_queries.append(sql_update_pztags_default)
 7290
 7291                        log.info(f"""Profile '{profile}' - Prioritization... """)
 7292
 7293                        if sql_queries:
 7294
 7295                            for sql_query in sql_queries:
 7296                                log.debug(
 7297                                    f"""Profile '{profile}' - Prioritization query: {sql_query}... """
 7298                                )
 7299                                self.conn.execute(sql_query)
 7300
 7301                        log.info(f"""Profile '{profile}' - Update... """)
 7302                        sql_query_update = f"""
 7303                            UPDATE {table_variants}
 7304                            SET INFO =  
 7305                                concat(
 7306                                    CASE
 7307                                        WHEN INFO NOT IN ('','.')
 7308                                        THEN concat(INFO, ';')
 7309                                        ELSE ''
 7310                                    END
 7311                                    {sql_set_info_option}
 7312                                )
 7313                        """
 7314                        self.conn.execute(sql_query_update)
 7315
 7316        else:
 7317
 7318            log.warning(f"No profiles in parameters")
 7319
 7320        # Remove added columns
 7321        for added_column in added_columns:
 7322            self.drop_column(column=added_column)
 7323
 7324        # Explode INFOS fields into table fields
 7325        if self.get_explode_infos():
 7326            self.explode_infos(
 7327                prefix=self.get_explode_infos_prefix(),
 7328                fields=self.get_explode_infos_fields(),
 7329                force=True,
 7330            )
 7331
 7332        return True
 7333
 7334    ###
 7335    # HGVS
 7336    ###
 7337
 7338    def annotation_hgvs(self, threads: int = None) -> None:
 7339        """
 7340        The `annotation_hgvs` function performs HGVS annotation on a set of variants using genomic
 7341        coordinates and alleles.
 7342
 7343        :param threads: The `threads` parameter is an optional integer that specifies the number of
 7344        threads to use for parallel processing. If no value is provided, it will default to the number
 7345        of threads obtained from the `get_threads()` method
 7346        :type threads: int
 7347        """
 7348
 7349        # Function for each partition of the Dask Dataframe
 7350        def partition_function(partition):
 7351            """
 7352            The function `partition_function` applies the `annotation_hgvs_partition` function to
 7353            each row of a DataFrame called `partition`.
 7354
 7355            :param partition: The parameter "partition" is a pandas DataFrame that contains the data
 7356            to be processed
 7357            :return: the result of applying the "annotation_hgvs_partition" function to each row of
 7358            the "partition" dataframe along the axis 1.
 7359            """
 7360            return partition.apply(annotation_hgvs_partition, axis=1)
 7361
 7362        def annotation_hgvs_partition(row) -> str:
 7363            """
 7364            The function `annotation_hgvs_partition` takes in a row of data and returns a string
 7365            containing a list of HGVS names associated with the given genomic coordinates and alleles.
 7366
 7367            :param row: A dictionary-like object that contains the values for the following keys:
 7368            :return: a string that contains the HGVS names associated with the given row of data.
 7369            """
 7370
 7371            chr = row["CHROM"]
 7372            pos = row["POS"]
 7373            ref = row["REF"]
 7374            alt = row["ALT"]
 7375
 7376            # Find list of associated transcripts
 7377            transcripts_list = list(
 7378                polars_conn.execute(
 7379                    f"""
 7380                SELECT transcript
 7381                FROM refseq_df
 7382                WHERE CHROM='{chr}'
 7383                AND POS={pos}
 7384            """
 7385                )["transcript"]
 7386            )
 7387
 7388            # Full HGVS annotation in list
 7389            hgvs_full_list = []
 7390
 7391            for transcript_name in transcripts_list:
 7392
 7393                # Transcript
 7394                transcript = get_transcript(
 7395                    transcripts=transcripts, transcript_name=transcript_name
 7396                )
 7397                # Exon
 7398                if use_exon:
 7399                    exon = transcript.find_exon_number(pos)
 7400                else:
 7401                    exon = None
 7402                # Protein
 7403                transcript_protein = None
 7404                if use_protein or add_protein or full_format:
 7405                    transcripts_protein = list(
 7406                        polars_conn.execute(
 7407                            f"""
 7408                        SELECT protein
 7409                        FROM refseqlink_df
 7410                        WHERE transcript='{transcript_name}'
 7411                        LIMIT 1
 7412                    """
 7413                        )["protein"]
 7414                    )
 7415                    if len(transcripts_protein):
 7416                        transcript_protein = transcripts_protein[0]
 7417
 7418                # HGVS name
 7419                hgvs_name = format_hgvs_name(
 7420                    chr,
 7421                    pos,
 7422                    ref,
 7423                    alt,
 7424                    genome=genome,
 7425                    transcript=transcript,
 7426                    transcript_protein=transcript_protein,
 7427                    exon=exon,
 7428                    use_gene=use_gene,
 7429                    use_protein=use_protein,
 7430                    full_format=full_format,
 7431                    use_version=use_version,
 7432                    codon_type=codon_type,
 7433                )
 7434                hgvs_full_list.append(hgvs_name)
 7435                if add_protein and not use_protein and not full_format:
 7436                    hgvs_name = format_hgvs_name(
 7437                        chr,
 7438                        pos,
 7439                        ref,
 7440                        alt,
 7441                        genome=genome,
 7442                        transcript=transcript,
 7443                        transcript_protein=transcript_protein,
 7444                        exon=exon,
 7445                        use_gene=use_gene,
 7446                        use_protein=True,
 7447                        full_format=False,
 7448                        use_version=use_version,
 7449                        codon_type=codon_type,
 7450                    )
 7451                    hgvs_full_list.append(hgvs_name)
 7452
 7453            # Create liste of HGVS annotations
 7454            hgvs_full = ",".join(hgvs_full_list)
 7455
 7456            return hgvs_full
 7457
 7458        # Polars connexion
 7459        polars_conn = pl.SQLContext(register_globals=True, eager=True)
 7460
 7461        # Config
 7462        config = self.get_config()
 7463
 7464        # Databases
 7465        # Genome
 7466        databases_genomes_folders = (
 7467            config.get("folders", {})
 7468            .get("databases", {})
 7469            .get("genomes", DEFAULT_GENOME_FOLDER)
 7470        )
 7471        databases_genome = (
 7472            config.get("folders", {}).get("databases", {}).get("genomes", "")
 7473        )
 7474        # refseq database folder
 7475        databases_refseq_folders = (
 7476            config.get("folders", {})
 7477            .get("databases", {})
 7478            .get("refseq", DEFAULT_REFSEQ_FOLDER)
 7479        )
 7480        # refseq
 7481        databases_refseq = config.get("databases", {}).get("refSeq", None)
 7482        # refSeqLink
 7483        databases_refseqlink = config.get("databases", {}).get("refSeqLink", None)
 7484
 7485        # Param
 7486        param = self.get_param()
 7487
 7488        # Quick HGVS
 7489        if "hgvs_options" in param and param.get("hgvs_options", ""):
 7490            log.info(f"Quick HGVS Annotation:")
 7491            if not param.get("hgvs", None):
 7492                param["hgvs"] = {}
 7493            for option in param.get("hgvs_options", "").split(","):
 7494                option_var_val = option.split("=")
 7495                option_var = option_var_val[0]
 7496                if len(option_var_val) > 1:
 7497                    option_val = option_var_val[1]
 7498                else:
 7499                    option_val = "True"
 7500                if option_val.upper() in ["TRUE"]:
 7501                    option_val = True
 7502                elif option_val.upper() in ["FALSE"]:
 7503                    option_val = False
 7504                log.info(f"   {option_var}={option_val}")
 7505                param["hgvs"][option_var] = option_val
 7506
 7507        # Check if HGVS annotation enabled
 7508        if "hgvs" in param:
 7509            log.info(f"HGVS Annotation... ")
 7510            for hgvs_option in param.get("hgvs", {}):
 7511                log.info(f"{hgvs_option}: {param.get('hgvs',{}).get(hgvs_option)}")
 7512        else:
 7513            return
 7514
 7515        # HGVS Param
 7516        param_hgvs = param.get("hgvs", {})
 7517        use_exon = param_hgvs.get("use_exon", False)
 7518        use_gene = param_hgvs.get("use_gene", False)
 7519        use_protein = param_hgvs.get("use_protein", False)
 7520        add_protein = param_hgvs.get("add_protein", False)
 7521        full_format = param_hgvs.get("full_format", False)
 7522        use_version = param_hgvs.get("use_version", False)
 7523        codon_type = param_hgvs.get("codon_type", "3")
 7524
        # refSeq refSeqLink
 7526        databases_refseq = param_hgvs.get("refseq", databases_refseq)
 7527        databases_refseqlink = param_hgvs.get("refseqlink", databases_refseqlink)
 7528
 7529        # Assembly
 7530        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
 7531
 7532        # Genome
 7533        genome_file = None
 7534        if find_genome(databases_genome):
 7535            genome_file = find_genome(databases_genome)
 7536        else:
 7537            genome_file = find_genome(
 7538                genome_path=databases_genomes_folders, assembly=assembly
 7539            )
 7540        log.debug("Genome: " + str(genome_file))
 7541
        # refSeq
 7543        refseq_file = find_file_prefix(
 7544            input_file=databases_refseq,
 7545            prefix="ncbiRefSeq",
 7546            folder=databases_refseq_folders,
 7547            assembly=assembly,
 7548        )
 7549        log.debug("refSeq: " + str(refseq_file))
 7550
 7551        # refSeqLink
 7552        refseqlink_file = find_file_prefix(
 7553            input_file=databases_refseqlink,
 7554            prefix="ncbiRefSeqLink",
 7555            folder=databases_refseq_folders,
 7556            assembly=assembly,
 7557        )
 7558        log.debug("refSeqLink: " + str(refseqlink_file))
 7559
 7560        # Threads
 7561        if not threads:
 7562            threads = self.get_threads()
 7563        log.debug("Threads: " + str(threads))
 7564
 7565        # Variables
 7566        table_variants = self.get_table_variants(clause="update")
 7567
 7568        # Get variants SNV and InDel only
 7569        query_variants = f"""
 7570            SELECT "#CHROM" AS CHROM, POS, REF, ALT
 7571            FROM {table_variants}
 7572            WHERE REF ~ '^[A-Za-z]+$' AND ALT ~ '^[A-Za-z]+$'
 7573            """
 7574        df_variants = self.get_query_to_df(query_variants)
 7575
 7576        # Added columns
 7577        added_columns = []
 7578
 7579        # Add hgvs column in variants table
 7580        hgvs_column_name = "hgvs_" + str(random.randrange(1000))
 7581        added_column = self.add_column(
 7582            table_variants, hgvs_column_name, "STRING", default_value=None
 7583        )
 7584        added_columns.append(added_column)
 7585
 7586        log.debug(f"refSeq loading...")
 7587        # refSeq in duckDB
 7588        refseq_table = get_refseq_table(
 7589            conn=self.conn, refseq_table="refseq", refseq_file=refseq_file
 7590        )
 7591        # Loading all refSeq in Dataframe
 7592        refseq_query = f"""
 7593            SELECT df_variants.CHROM, df_variants.POS, {refseq_table}.name AS transcript
 7594            FROM {refseq_table}
 7595            JOIN df_variants ON (
 7596                {refseq_table}.chrom = df_variants.CHROM
 7597                AND {refseq_table}.txStart<=df_variants.POS
 7598                AND {refseq_table}.txEnd>=df_variants.POS
 7599            )
 7600        """
 7601        refseq_df = self.conn.query(refseq_query).pl()
 7602
 7603        if refseqlink_file:
 7604            log.debug(f"refSeqLink loading...")
 7605            # refSeqLink in duckDB
 7606            refseqlink_table = get_refseq_table(
 7607                conn=self.conn, refseq_table="refseqlink", refseq_file=refseqlink_file
 7608            )
 7609            # Loading all refSeqLink in Dataframe
 7610            protacc_column = "protAcc_with_ver"
 7611            mrnaacc_column = "mrnaAcc_with_ver"
 7612            refseqlink_query = f"""
 7613                SELECT {refseq_table}.chrom, {protacc_column} AS protein, {mrnaacc_column} AS transcript
 7614                FROM {refseqlink_table} 
 7615                JOIN {refseq_table} ON ({refseq_table}.name = {refseqlink_table}.mrnaAcc_with_ver)
 7616                WHERE protAcc_without_ver IS NOT NULL
 7617            """
 7618            # Polars Dataframe
 7619            refseqlink_df = self.conn.query(f"{refseqlink_query}").pl()
 7620
 7621        # Read RefSeq transcripts into a python dict/model.
 7622        log.debug(f"Transcripts loading...")
 7623        with tempfile.TemporaryDirectory() as tmpdir:
 7624            transcripts_query = f"""
 7625                COPY (
 7626                    SELECT {refseq_table}.*
 7627                    FROM {refseq_table}
 7628                    JOIN df_variants ON (
 7629                        {refseq_table}.chrom=df_variants.CHROM
 7630                        AND {refseq_table}.txStart<=df_variants.POS
 7631                        AND {refseq_table}.txEnd>=df_variants.POS
 7632                    )
 7633                )
 7634                TO '{tmpdir}/transcript.tsv' (DELIMITER '\t');
 7635            """
 7636            self.conn.query(transcripts_query)
 7637            with open(f"{tmpdir}/transcript.tsv") as infile:
 7638                transcripts = read_transcripts(infile)
 7639
 7640        # Polars connexion
 7641        polars_conn = pl.SQLContext(register_globals=True, eager=True)
 7642
 7643        log.debug("Genome loading...")
 7644        # Read genome sequence using pyfaidx.
 7645        genome = Fasta(genome_file)
 7646
 7647        log.debug("Start annotation HGVS...")
 7648
 7649        # Create
 7650        # a Dask Dataframe from Pandas dataframe with partition as number of threads
 7651        ddf = dd.from_pandas(df_variants, npartitions=threads)
 7652
 7653        # Use dask.dataframe.apply() to apply function on each partition
 7654        ddf[hgvs_column_name] = ddf.map_partitions(partition_function)
 7655
 7656        # Convert Dask DataFrame to Pandas Dataframe
 7657        df = ddf.compute()
 7658
 7659        # Convert Pandas dataframe to parquet (due to error in cast VARCHAR -> NULL ???)
 7660        with tempfile.TemporaryDirectory() as tmpdir:
 7661            df_parquet = os.path.join(tmpdir, "df.parquet")
 7662            df.to_parquet(df_parquet)
 7663
 7664            # Update hgvs column
 7665            update_variant_query = f"""
 7666                UPDATE {table_variants}
 7667                SET "{hgvs_column_name}"=df."{hgvs_column_name}"
 7668                FROM read_parquet('{df_parquet}') as df
 7669                WHERE variants."#CHROM" = df.CHROM
 7670                AND variants.POS = df.POS
 7671                AND variants.REF = df.REF
 7672                AND variants.ALT = df.ALT
 7673                AND df."{hgvs_column_name}" NOT IN ('') AND df."{hgvs_column_name}" NOT NULL
 7674                """
 7675            self.execute_query(update_variant_query)
 7676
 7677        # Update INFO column
 7678        sql_query_update = f"""
 7679            UPDATE {table_variants}
 7680            SET INFO = 
 7681                concat(
 7682                    CASE 
 7683                        WHEN INFO NOT IN ('','.')
 7684                        THEN concat(INFO, ';')
 7685                        ELSE ''
 7686                    END,
 7687                    'hgvs=',
 7688                    {hgvs_column_name}
 7689                )
 7690            WHERE "{hgvs_column_name}" NOT IN ('') AND "{hgvs_column_name}" NOT NULL
 7691            """
 7692        self.execute_query(sql_query_update)
 7693
 7694        # Add header
 7695        HGVS_INFOS = {
 7696            "hgvs": {
 7697                "ID": "hgvs",
 7698                "Number": ".",
 7699                "Type": "String",
 7700                "Description": f"HGVS annotatation with HOWARD",
 7701            }
 7702        }
 7703
 7704        for field in HGVS_INFOS:
 7705            field_ID = HGVS_INFOS[field]["ID"]
 7706            field_description = HGVS_INFOS[field]["Description"]
 7707            self.get_header().infos[field_ID] = vcf.parser._Info(
 7708                field_ID,
 7709                HGVS_INFOS[field]["Number"],
 7710                HGVS_INFOS[field]["Type"],
 7711                field_description,
 7712                "unknown",
 7713                "unknown",
 7714                code_type_map[HGVS_INFOS[field]["Type"]],
 7715            )
 7716
 7717        # Remove added columns
 7718        for added_column in added_columns:
 7719            self.drop_column(column=added_column)
 7720
 7721    ###
 7722    # Calculation
 7723    ###
 7724
 7725    def get_operations_help(
 7726        self, operations_config_dict: dict = {}, operations_config_file: str = None
 7727    ) -> list:
 7728
 7729        # Init
 7730        operations_help = []
 7731
 7732        # operations
 7733        operations = self.get_config_json(
 7734            name="calculations",
 7735            config_dict=operations_config_dict,
 7736            config_file=operations_config_file,
 7737        )
 7738        for op in operations:
 7739            op_name = operations[op].get("name", op).upper()
 7740            op_description = operations[op].get("description", op_name)
 7741            op_available = operations[op].get("available", False)
 7742            if op_available:
 7743                operations_help.append(f"   {op_name}: {op_description}")
 7744
 7745        # Sort operations
 7746        operations_help.sort()
 7747
 7748        # insert header
 7749        operations_help.insert(0, "Available calculation operations:")
 7750
 7751        # Return
 7752        return operations_help
 7753
 7754    def calculation(
 7755        self,
 7756        operations: dict = {},
 7757        operations_config_dict: dict = {},
 7758        operations_config_file: str = None,
 7759    ) -> None:
 7760        """
 7761        It takes a list of operations, and for each operation, it checks if it's a python or sql
 7762        operation, and then calls the appropriate function
 7763
 7764        param json example:
 7765            "calculation": {
 7766                "NOMEN": {
 7767                    "options": {
 7768                        "hgvs_field": "hgvs"
 7769                    },
 7770                "middle" : null
 7771            }
 7772        """
 7773
 7774        # Param
 7775        param = self.get_param()
 7776
 7777        # operations config
 7778        operations_config = self.get_config_json(
 7779            name="calculations",
 7780            config_dict=operations_config_dict,
 7781            config_file=operations_config_file,
 7782        )
 7783
 7784        # Upper keys
 7785        operations_config = {k.upper(): v for k, v in operations_config.items()}
 7786
 7787        # Calculations
 7788
 7789        # Operations from param
 7790        operations = param.get("calculation", {}).get("calculations", operations)
 7791
 7792        # Quick calculation - add
 7793        if param.get("calculations", None):
 7794            calculations_list = [
 7795                value for value in param.get("calculations", "").split(",")
 7796            ]
 7797            log.info(f"Quick Calculations:")
 7798            for calculation_key in calculations_list:
 7799                log.info(f"   {calculation_key}")
 7800            for calculation_operation in calculations_list:
 7801                if calculation_operation.upper() not in operations:
 7802                    operations[calculation_operation.upper()] = {}
 7803                    add_value_into_dict(
 7804                        dict_tree=param,
 7805                        sections=[
 7806                            "calculation",
 7807                            "calculations",
 7808                            calculation_operation.upper(),
 7809                        ],
 7810                        value={},
 7811                    )
 7812
 7813        # Operations for calculation
 7814        if not operations:
 7815            operations = param.get("calculation", {}).get("calculations", {})
 7816
 7817        if operations:
 7818            log.info(f"Calculations...")
 7819
 7820        # For each operations
 7821        for operation_name in operations:
 7822            operation_name = operation_name.upper()
 7823            if operation_name not in [""]:
 7824                if operation_name in operations_config:
 7825                    log.info(f"Calculation '{operation_name}'")
 7826                    operation = operations_config[operation_name]
 7827                    operation_type = operation.get("type", "sql")
 7828                    if operation_type == "python":
 7829                        self.calculation_process_function(
 7830                            operation=operation, operation_name=operation_name
 7831                        )
 7832                    elif operation_type == "sql":
 7833                        self.calculation_process_sql(
 7834                            operation=operation, operation_name=operation_name
 7835                        )
 7836                    else:
 7837                        log.error(
 7838                            f"Operations config: Type '{operation_type}' NOT available"
 7839                        )
 7840                        raise ValueError(
 7841                            f"Operations config: Type '{operation_type}' NOT available"
 7842                        )
 7843                else:
 7844                    log.error(
 7845                        f"Operations config: Calculation '{operation_name}' NOT available"
 7846                    )
 7847                    raise ValueError(
 7848                        f"Operations config: Calculation '{operation_name}' NOT available"
 7849                    )
 7850
 7851        # Explode INFOS fields into table fields
 7852        if self.get_explode_infos():
 7853            self.explode_infos(
 7854                prefix=self.get_explode_infos_prefix(),
 7855                fields=self.get_explode_infos_fields(),
 7856                force=True,
 7857            )
 7858
 7859    def calculation_process_sql(
 7860        self, operation: dict, operation_name: str = "unknown"
 7861    ) -> None:
 7862        """
 7863        The `calculation_process_sql` function takes in a mathematical operation as a string and
 7864        performs the operation, updating the specified table with the result.
 7865
 7866        :param operation: The `operation` parameter is a dictionary that contains information about the
 7867        mathematical operation to be performed. It includes the following keys:
 7868        :type operation: dict
 7869        :param operation_name: The `operation_name` parameter is a string that represents the name of
 7870        the mathematical operation being performed. It is used for logging and error handling purposes,
 7871        defaults to unknown
 7872        :type operation_name: str (optional)
 7873        """
 7874
 7875        # table variants
 7876        table_variants = self.get_table_variants(clause="alter")
 7877
 7878        # Operation infos
 7879        operation_name = operation.get("name", "unknown")
 7880        log.debug(f"process sql {operation_name}")
 7881        output_column_name = operation.get("output_column_name", operation_name)
 7882        output_column_type = operation.get("output_column_type", "String")
 7883        prefix = operation.get("explode_infos_prefix", "")
 7884        output_column_type_sql = code_type_map_to_sql.get(output_column_type, "VARCHAR")
 7885        output_column_description = operation.get(
 7886            "output_column_description", f"{operation_name} operation"
 7887        )
 7888        operation_query = operation.get("operation_query", None)
 7889        if isinstance(operation_query, list):
 7890            operation_query = " ".join(operation_query)
 7891        operation_info_fields = operation.get("info_fields", [])
 7892        operation_info_fields_check = operation.get("info_fields_check", False)
 7893        operation_info = operation.get("operation_info", True)
 7894
 7895        if operation_query:
 7896
 7897            # Info fields check
 7898            operation_info_fields_check_result = True
 7899            if operation_info_fields_check:
 7900                header_infos = self.get_header().infos
 7901                for info_field in operation_info_fields:
 7902                    operation_info_fields_check_result = (
 7903                        operation_info_fields_check_result
 7904                        and info_field in header_infos
 7905                    )
 7906
 7907            # If info fields available
 7908            if operation_info_fields_check_result:
 7909
 7910                # Added_columns
 7911                added_columns = []
 7912
 7913                # Create VCF header field
 7914                vcf_reader = self.get_header()
 7915                vcf_reader.infos[output_column_name] = vcf.parser._Info(
 7916                    output_column_name,
 7917                    ".",
 7918                    output_column_type,
 7919                    output_column_description,
 7920                    "howard calculation",
 7921                    "0",
 7922                    self.code_type_map.get(output_column_type),
 7923                )
 7924
 7925                # Explode infos if needed
 7926                log.debug(f"calculation_process_sql prefix {prefix}")
 7927                added_columns += self.explode_infos(
 7928                    prefix=prefix,
 7929                    fields=[output_column_name] + operation_info_fields,
 7930                    force=True,
 7931                )
 7932
 7933                # Create column
 7934                added_column = self.add_column(
 7935                    table_name=table_variants,
 7936                    column_name=prefix + output_column_name,
 7937                    column_type=output_column_type_sql,
 7938                    default_value="null",
 7939                )
 7940                added_columns.append(added_column)
 7941
 7942                # Operation calculation
 7943                try:
 7944
 7945                    # Query to update calculation column
 7946                    sql_update = f"""
 7947                        UPDATE {table_variants}
 7948                        SET "{prefix}{output_column_name}" = ({operation_query})
 7949                    """
 7950                    self.conn.execute(sql_update)
 7951
 7952                    # Add to INFO
 7953                    if operation_info:
 7954                        sql_update_info = f"""
 7955                            UPDATE {table_variants}
 7956                            SET "INFO" =
 7957                                concat(
 7958                                    CASE
 7959                                        WHEN "INFO" IS NOT NULL
 7960                                        THEN concat("INFO", ';')
 7961                                        ELSE ''
 7962                                    END,
 7963                                    '{output_column_name}=',
 7964                                    "{prefix}{output_column_name}"
 7965                                )
 7966                            WHERE "{prefix}{output_column_name}" IS NOT NULL AND "{prefix}{output_column_name}" NOT IN ('')
 7967                        """
 7968                        self.conn.execute(sql_update_info)
 7969
 7970                except:
 7971                    log.error(
 7972                        f"Operations config: Calculation '{operation_name}' query failed"
 7973                    )
 7974                    raise ValueError(
 7975                        f"Operations config: Calculation '{operation_name}' query failed"
 7976                    )
 7977
 7978                # Remove added columns
 7979                for added_column in added_columns:
 7980                    log.debug(f"added_column: {added_column}")
 7981                    self.drop_column(column=added_column)
 7982
 7983            else:
 7984                log.error(
 7985                    f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}"
 7986                )
 7987                raise ValueError(
 7988                    f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}"
 7989                )
 7990
 7991        else:
 7992            log.error(
 7993                f"Operations config: Calculation '{operation_name}' query NOT defined"
 7994            )
 7995            raise ValueError(
 7996                f"Operations config: Calculation '{operation_name}' query NOT defined"
 7997            )
 7998
 7999    def calculation_process_function(
 8000        self, operation: dict, operation_name: str = "unknown"
 8001    ) -> None:
 8002        """
 8003        The `calculation_process_function` takes in an operation dictionary and performs the specified
 8004        function with the given parameters.
 8005
 8006        :param operation: The `operation` parameter is a dictionary that contains information about the
 8007        operation to be performed. It has the following keys:
 8008        :type operation: dict
 8009        :param operation_name: The `operation_name` parameter is a string that represents the name of
 8010        the operation being performed. It is used for logging purposes, defaults to unknown
 8011        :type operation_name: str (optional)
 8012        """
 8013
 8014        operation_name = operation["name"]
 8015        log.debug(f"process sql {operation_name}")
 8016        function_name = operation["function_name"]
 8017        function_params = operation["function_params"]
 8018        getattr(self, function_name)(*function_params)
 8019
 8020    def calculation_variant_id(self) -> None:
 8021        """
 8022        The function `calculation_variant_id` adds a variant ID annotation to a VCF file header and
 8023        updates the INFO field of a variants table with the variant ID.
 8024        """
 8025
 8026        # variant_id annotation field
 8027        variant_id_tag = self.get_variant_id_column()
 8028        added_columns = [variant_id_tag]
 8029
 8030        # variant_id hgvs tags"
 8031        vcf_infos_tags = {
 8032            variant_id_tag: "howard variant ID annotation",
 8033        }
 8034
 8035        # Variants table
 8036        table_variants = self.get_table_variants()
 8037
 8038        # Header
 8039        vcf_reader = self.get_header()
 8040
 8041        # Add variant_id to header
 8042        vcf_reader.infos[variant_id_tag] = vcf.parser._Info(
 8043            variant_id_tag,
 8044            ".",
 8045            "String",
 8046            vcf_infos_tags.get(variant_id_tag, "howard variant ID annotation"),
 8047            "howard calculation",
 8048            "0",
 8049            self.code_type_map.get("String"),
 8050        )
 8051
 8052        # Update
 8053        sql_update = f"""
 8054            UPDATE {table_variants}
 8055            SET "INFO" = 
 8056                concat(
 8057                    CASE
 8058                        WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 8059                        THEN ''
 8060                        ELSE concat("INFO", ';')
 8061                    END,
 8062                    '{variant_id_tag}=',
 8063                    "{variant_id_tag}"
 8064                )
 8065        """
 8066        self.conn.execute(sql_update)
 8067
 8068        # Remove added columns
 8069        for added_column in added_columns:
 8070            self.drop_column(column=added_column)
 8071
 8072    def calculation_extract_snpeff_hgvs(
 8073        self,
 8074        snpeff_hgvs: str = "snpeff_hgvs",
 8075        snpeff_field: str = "ANN",
 8076    ) -> None:
 8077        """
 8078        The function `calculation_extract_snpeff_hgvs` extracts HGVS nomenclatures from the SnpEff
 8079        annotation field in a VCF file and adds them as a new column in the variants table.
 8080
 8081        :param snpeff_hgvs: The `snpeff_hgvs` parameter in the `calculation_extract_snpeff_hgvs`
 8082        function is used to specify the name of the column that will store the HGVS nomenclatures
 8083        extracted from the SnpEff annotation field in a VCF file. This parameter allows you, defaults to
 8084        snpeff_hgvs
 8085        :type snpeff_hgvs: str (optional)
 8086        :param snpeff_field: The `snpeff_field` parameter in the `calculation_extract_snpeff_hgvs`
 8087        function represents the field in the VCF file that contains SnpEff annotations. This field is
 8088        used to extract HGVS nomenclatures from the SnpEff annotation field and add them as a, defaults
 8089        to ANN
 8090        :type snpeff_field: str (optional)
 8091        """
 8092
 8093        # Snpeff hgvs tags
 8094        vcf_infos_tags = {
 8095            snpeff_hgvs: "HGVS nomenclatures from snpEff annotation",
 8096        }
 8097
 8098        # Prefix
 8099        prefix = self.get_explode_infos_prefix()
 8100        if prefix:
 8101            prefix = "INFO/"
 8102
 8103        # snpEff fields
 8104        speff_ann_infos = prefix + snpeff_field
 8105        speff_hgvs_infos = prefix + snpeff_hgvs
 8106
 8107        # Variants table
 8108        table_variants = self.get_table_variants()
 8109
 8110        # Header
 8111        vcf_reader = self.get_header()
 8112
 8113        # Add columns
 8114        added_columns = []
 8115
 8116        # Explode HGVS field in column
 8117        added_columns += self.explode_infos(fields=[snpeff_field])
 8118
 8119        if snpeff_field in vcf_reader.infos:
 8120
 8121            log.debug(vcf_reader.infos[snpeff_field])
 8122
 8123            # Extract ANN header
 8124            ann_description = vcf_reader.infos[snpeff_field].desc
 8125            pattern = r"'(.+?)'"
 8126            match = re.search(pattern, ann_description)
 8127            if match:
 8128                ann_header_match = match.group(1).split(" | ")
 8129                ann_header_desc = {}
 8130                for i in range(len(ann_header_match)):
 8131                    ann_header_info = "".join(
 8132                        char for char in ann_header_match[i] if char.isalnum()
 8133                    )
 8134                    ann_header_desc[ann_header_info] = ann_header_match[i]
 8135                if not ann_header_desc:
 8136                    raise ValueError("Invalid header description format")
 8137            else:
 8138                raise ValueError("Invalid header description format")
 8139
 8140            # Create variant id
 8141            variant_id_column = self.get_variant_id_column()
 8142            added_columns += [variant_id_column]
 8143
 8144            # Create dataframe
 8145            dataframe_snpeff_hgvs = self.get_query_to_df(
 8146                f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
 8147            )
 8148
 8149            # Create main NOMEN column
 8150            dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
 8151                speff_ann_infos
 8152            ].apply(
 8153                lambda x: extract_snpeff_hgvs(
 8154                    str(x), header=list(ann_header_desc.values())
 8155                )
 8156            )
 8157
 8158            # Add snpeff_hgvs to header
 8159            vcf_reader.infos[snpeff_hgvs] = vcf.parser._Info(
 8160                snpeff_hgvs,
 8161                ".",
 8162                "String",
 8163                vcf_infos_tags.get(snpeff_hgvs, "snpEff hgvs annotations"),
 8164                "howard calculation",
 8165                "0",
 8166                self.code_type_map.get("String"),
 8167            )
 8168
 8169            # Update
 8170            sql_update = f"""
 8171                UPDATE variants
 8172                SET "INFO" = 
 8173                    concat(
 8174                        CASE
 8175                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 8176                            THEN ''
 8177                            ELSE concat("INFO", ';')
 8178                        END,
 8179                        CASE 
 8180                            WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
 8181                            AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
 8182                            THEN concat(
 8183                                    '{snpeff_hgvs}=',
 8184                                    dataframe_snpeff_hgvs."{speff_hgvs_infos}"
 8185                                )
 8186                            ELSE ''
 8187                        END
 8188                    )
 8189                FROM dataframe_snpeff_hgvs
 8190                WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"
 8191
 8192            """
 8193            self.conn.execute(sql_update)
 8194
 8195            # Delete dataframe
 8196            del dataframe_snpeff_hgvs
 8197            gc.collect()
 8198
 8199        else:
 8200
 8201            log.warning(
 8202                "No snpEff annotation. Please Anotate with snpEff before use this calculation option"
 8203            )
 8204
 8205        # Remove added columns
 8206        for added_column in added_columns:
 8207            self.drop_column(column=added_column)
 8208
    def calculation_snpeff_ann_explode(
        self,
        uniquify: bool = True,
        output_format: str = "fields",
        output_prefix: str = "snpeff_",
        snpeff_field: str = "ANN",
    ) -> None:
        """
        Explode snpEff annotations (the 'ANN' INFO field) into separate INFO
        fields, or into a single JSON INFO field.

        The annotation sub-field names are parsed from the quoted,
        pipe-separated list found in the VCF header description of
        `snpeff_field`. Each variant's annotations are exploded with
        `explode_snpeff_ann`, the generated INFO tags are registered in the
        VCF header, and the exploded annotations are appended to the INFO
        column of the variants table. If `snpeff_field` is absent from the
        header, a warning is logged and nothing is changed.

        :param uniquify: If True, duplicated annotation values are removed
        from the exploded output, defaults to True
        :type uniquify: bool (optional)
        :param output_format: "fields" creates one INFO tag per annotation
        sub-field; "JSON" creates a single INFO tag (named `output_prefix`)
        holding the annotations as JSON, defaults to fields
        :type output_format: str (optional)
        :param output_prefix: Prefix prepended to each generated INFO tag
        (used as the full tag name in JSON mode), defaults to snpeff_
        :type output_prefix: str (optional)
        :param snpeff_field: Name of the INFO field holding the raw snpEff
        annotations, defaults to ANN
        :type snpeff_field: str (optional)
        :raises ValueError: If the header description of `snpeff_field` does
        not contain a quoted, pipe-separated annotation list
        """

        # Internal name of the temporary exploded-annotations column
        snpeff_hgvs = "snpeff_ann_explode"

        # Default descriptions for the header tags created below
        vcf_infos_tags = {
            snpeff_hgvs: "Explode snpEff annotations",
        }

        # Prefix of exploded INFO columns
        # NOTE(review): any non-empty configured prefix is replaced by
        # "INFO/", while an empty prefix is kept as-is — confirm this
        # inversion is intended (sibling calculations use the prefix as-is)
        prefix = self.get_explode_infos_prefix()
        if prefix:
            prefix = "INFO/"

        # Column names for the raw and the exploded snpEff annotations
        speff_ann_infos = prefix + snpeff_field
        speff_hgvs_infos = prefix + snpeff_hgvs

        # Variants table name
        table_variants = self.get_table_variants()

        # VCF header object (its `infos` dict is mutated below)
        vcf_reader = self.get_header()

        # Track temporary columns so they can be dropped at the end
        added_columns = []

        # Explode the raw snpEff field into its own column
        added_columns += self.explode_infos(fields=[snpeff_field])
        log.debug(f"snpeff_field={snpeff_field}")
        log.debug(f"added_columns={added_columns}")

        if snpeff_field in vcf_reader.infos:

            # Extract the annotation sub-field names from the header
            # description, e.g. "... 'Allele | Annotation | ...'"
            ann_description = vcf_reader.infos[snpeff_field].desc
            pattern = r"'(.+?)'"
            match = re.search(pattern, ann_description)
            if match:
                ann_header_match = match.group(1).split(" | ")
                ann_header = []
                ann_header_desc = {}
                for i in range(len(ann_header_match)):
                    # Sanitized (alphanumeric-only) sub-field name, mapped
                    # back to its original description
                    ann_header_info = "".join(
                        char for char in ann_header_match[i] if char.isalnum()
                    )
                    ann_header.append(ann_header_info)
                    ann_header_desc[ann_header_info] = ann_header_match[i]
                if not ann_header_desc:
                    raise ValueError("Invalid header description format")
            else:
                raise ValueError("Invalid header description format")

            # Temporary variant id column used as join key for the UPDATE
            variant_id_column = self.get_variant_id_column()
            added_columns += [variant_id_column]

            # Fetch variant ids and raw annotations into a dataframe
            dataframe_snpeff_hgvs = self.get_query_to_df(
                f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
            )

            # Explode each variant's annotations into a new column
            dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
                speff_ann_infos
            ].apply(
                lambda x: explode_snpeff_ann(
                    str(x),
                    uniquify=uniquify,
                    output_format=output_format,
                    prefix=output_prefix,
                    header=list(ann_header_desc.values()),
                )
            )

            # Register the generated tags in the VCF header
            ann_annotations_prefix = ""
            if output_format.upper() in ["JSON"]:
                # JSON mode: one tag named after the prefix; the INFO value
                # is written as "<output_prefix>=<json>"
                ann_annotations_prefix = f"{output_prefix}="
                vcf_reader.infos[output_prefix] = vcf.parser._Info(
                    output_prefix,
                    ".",
                    "String",
                    vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
                    + " - JSON format",
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )
            else:
                # Fields mode: one tag per annotation sub-field; the exploded
                # string presumably already contains "tag=value" pairs, so no
                # prefix is added in the SQL below
                for ann_annotation in ann_header:
                    ann_annotation_id = f"{output_prefix}{ann_annotation}"
                    vcf_reader.infos[ann_annotation_id] = vcf.parser._Info(
                        ann_annotation_id,
                        ".",
                        "String",
                        vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
                        + f" - '{ann_header_desc[ann_annotation]}' annotation",
                        "howard calculation",
                        "0",
                        self.code_type_map.get("String"),
                    )

            # Append the exploded annotations to the INFO column. The SQL
            # references the pandas dataframe by its Python variable name
            # (duckdb replacement scan) — do not rename `dataframe_snpeff_hgvs`
            sql_update = f"""
                UPDATE variants
                SET "INFO" = 
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE 
                            WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
                                AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
                            THEN concat(
                                '{ann_annotations_prefix}',
                                dataframe_snpeff_hgvs."{speff_hgvs_infos}"
                                )
                            ELSE ''
                        END
                    )
                FROM dataframe_snpeff_hgvs
                WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"

            """
            self.conn.execute(sql_update)

            # Free the dataframe memory eagerly
            del dataframe_snpeff_hgvs
            gc.collect()

        else:

            log.warning(
                "No snpEff annotation. Please Anotate with snpEff before use this calculation option"
            )

        # Drop the temporary exploded columns
        for added_column in added_columns:
            self.drop_column(column=added_column)
 8383
    def calculation_extract_nomen(self) -> None:
        """
        This function extracts the HGVS nomenclature (NOMEN and related
        fields) from an HGVS annotation field and appends them to the INFO
        column.

        The HGVS field name and an optional preferred-transcripts file are
        read from param["calculation"]["calculations"]["NOMEN"]["options"]
        ("hgvs_field", default "hgvs", and "transcripts"). Each HGVS value is
        parsed with `find_nomen`, and the resulting fields (NOMEN, CNOMEN,
        ..., GNOMEN) are registered in the VCF header and appended to INFO.

        :raises ValueError: If the configured transcripts file does not exist
        """

        # Name of the temporary dataframe column holding the parsed NOMEN dict
        field_nomen_dict = "NOMEN_DICT"

        # NOMEN fields to extract, with their VCF header descriptions
        nomen_dict = {
            "NOMEN": "NOMEN hgvs nomenclature considered as reference hgvs (official transcript, first otherwise)",
            "CNOMEN": "CNOMEN hgvs nomenclature at DNA level related to a transcript (TNOMEN)",
            "RNOMEN": "RNOMEN hgvs nomenclature at RNA level related to a transcript (TNOMEN)",
            "NNOMEN": "NNOMEN hgvs nomenclature for non-coding variant",
            "PNOMEN": "PNOMEN hgvs nomenclature at Protein level related to a transcript (TNOMEN)",
            "TVNOMEN": "TVNOMEN hgvs transcript with version (if any) used (e.g. for CNOMEN and PNOMEN)",
            "TNOMEN": "TNOMEN hgvs transcript used (e.g. for CNOMEN and PNOMEN)",
            "VNOMEN": "VNOMEN hgvs transcript version used (e.g. for CNOMEN and PNOMEN)",
            "ENOMEN": "ENOMEN hgvs exon nomenclature related to a transcript (TNOMEN)",
            "GNOMEN": "GNOMEN hgvs gene nomenclature related to a transcript (TNOMEN)",
        }

        # Calculation parameters
        param = self.get_param()

        # Prefix of exploded INFO columns
        prefix = self.get_explode_infos_prefix()

        # VCF header object (its `infos` dict is mutated below)
        vcf_reader = self.get_header()

        # INFO field holding the HGVS annotations to parse
        hgvs_field = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("hgvs_field", "hgvs")
        )

        # Optional preferred-transcripts file (first column = transcript ids)
        transcripts_file = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("transcripts", None)
        )
        transcripts_file = full_path(transcripts_file)
        transcripts = []
        if transcripts_file:
            if os.path.exists(transcripts_file):
                transcripts_dataframe = transcripts_file_to_df(transcripts_file)
                transcripts = transcripts_dataframe.iloc[:, 0].tolist()
            else:
                log.error(f"Transcript file '{transcripts_file}' does NOT exist")
                raise ValueError(f"Transcript file '{transcripts_file}' does NOT exist")

        # Track temporary columns so they can be dropped at the end
        added_columns = []

        # Explode the HGVS field into its own column
        added_columns += self.explode_infos(fields=[hgvs_field])

        # Only proceed if the exploded HGVS column is available
        extra_infos = self.get_extra_infos()
        extra_field = prefix + hgvs_field

        if extra_field in extra_infos:

            # Fetch variant keys and HGVS values into a dataframe
            dataframe_hgvs = self.get_query_to_df(
                f""" SELECT "#CHROM", "POS", "REF", "ALT", "{extra_field}" FROM variants """
            )

            # Parse each HGVS value into a dict of NOMEN fields
            dataframe_hgvs[field_nomen_dict] = dataframe_hgvs[extra_field].apply(
                lambda x: find_nomen(str(x), transcripts=transcripts)
            )

            # Explode NOMEN Structure and create SQL set for update
            sql_nomen_fields = []
            for nomen_field in nomen_dict:

                # Explode each field into a column
                dataframe_hgvs[nomen_field] = dataframe_hgvs[field_nomen_dict].apply(
                    lambda x: dict(x).get(nomen_field, "")
                )

                # Register the field in the VCF header
                vcf_reader.infos[nomen_field] = vcf.parser._Info(
                    nomen_field,
                    ".",
                    "String",
                    nomen_dict.get(nomen_field, "howard calculation NOMEN"),
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )
                sql_nomen_fields.append(
                    f"""
                        CASE 
                            WHEN dataframe_hgvs."{nomen_field}" NOT NULL AND dataframe_hgvs."{nomen_field}" NOT IN ('')
                            THEN concat(
                                    ';{nomen_field}=',
                                    dataframe_hgvs."{nomen_field}"
                                )
                            ELSE ''
                        END
                    """
                )

            # SQL set for update
            sql_nomen_fields_set = ", ".join(sql_nomen_fields)

            # Append the NOMEN fields to INFO. The SQL references the pandas
            # dataframe by its Python variable name (duckdb replacement scan)
            # — do not rename `dataframe_hgvs`.
            # NOTE(review): unlike sibling calculations, '' and '.' INFO
            # values are kept as-is here, and each appended field starts with
            # ';' — an empty INFO therefore gains a leading ';'. Confirm
            # downstream tolerates this.
            sql_update = f"""
                UPDATE variants
                SET "INFO" = 
                    concat(
                        CASE
                            WHEN "INFO" IS NULL
                            THEN ''
                            ELSE "INFO"
                        END,
                        {sql_nomen_fields_set}
                    )
                FROM dataframe_hgvs
                WHERE variants."#CHROM" = dataframe_hgvs."#CHROM"
                    AND variants."POS" = dataframe_hgvs."POS" 
                    AND variants."REF" = dataframe_hgvs."REF"
                    AND variants."ALT" = dataframe_hgvs."ALT"
            """
            self.conn.execute(sql_update)

            # Free the dataframe memory eagerly
            del dataframe_hgvs
            gc.collect()

        # Drop the temporary exploded columns
        for added_column in added_columns:
            self.drop_column(column=added_column)
 8526
    def calculation_find_by_pipeline(self, tag: str = "findbypipeline") -> None:
        """
        The function `calculation_find_by_pipeline` computes, for each
        variant, the number of pipelines/samples in which the variant is
        found, and appends the result to the INFO field.

        Does nothing when the VCF has no FORMAT column or no samples.

        :param tag: The `tag` parameter is a string that represents the
        annotation field for the "findbypipeline" information in the VCF
        file. It is used to create the annotation field in the VCF header and
        to update the corresponding field in the variants table, defaults to
        findbypipeline
        :type tag: str (optional)
        """

        # Only applicable when genotypes are present (FORMAT column + samples)
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # findbypipeline annotation field
            findbypipeline_tag = tag

            # Header description for the generated INFO tag
            vcf_infos_tags = {
                findbypipeline_tag: f"Number of pipeline/sample for a variant ({findbypipeline_tag})",
            }

            # Prefix of exploded INFO columns
            prefix = self.get_explode_infos_prefix()

            # Name of the dataframe column holding the computed value
            findbypipeline_infos = prefix + findbypipeline_tag

            # Variants table name
            table_variants = self.get_table_variants()

            # VCF header object (its `infos` dict is mutated below)
            vcf_reader = self.get_header()

            # Temporary variant id column used as join key for the UPDATE
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # Columns needed for the computation: id, FORMAT and samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                self.get_header_sample_list()
            )

            # Fetch genotype data into a dataframe
            dataframe_findbypipeline = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Compute the findbypipeline value for each variant row
            dataframe_findbypipeline[findbypipeline_infos] = (
                dataframe_findbypipeline.apply(
                    lambda row: findbypipeline(
                        row, samples=self.get_header_sample_list()
                    ),
                    axis=1,
                )
            )

            # Register the findbypipeline tag in the VCF header
            vcf_reader.infos[findbypipeline_tag] = vcf.parser._Info(
                findbypipeline_tag,
                ".",
                "String",
                vcf_infos_tags.get(findbypipeline_tag, "Find in pipeline/sample"),
                "howard calculation",
                "0",
                self.code_type_map.get("String"),
            )

            # Append "<tag>=<value>" to INFO. The SQL references the pandas
            # dataframe by its Python variable name (duckdb replacement scan)
            # — do not rename `dataframe_findbypipeline`
            sql_update = f"""
                UPDATE variants
                SET "INFO" = 
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE 
                            WHEN dataframe_findbypipeline."{findbypipeline_infos}" NOT IN ('','.')
                                AND dataframe_findbypipeline."{findbypipeline_infos}" NOT NULL
                            THEN concat(
                                    '{findbypipeline_tag}=',
                                    dataframe_findbypipeline."{findbypipeline_infos}"
                                )
                            ELSE ''
                        END
                    )
                FROM dataframe_findbypipeline
                WHERE variants."{variant_id_column}" = dataframe_findbypipeline."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Drop the temporary variant id column
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Free the dataframe memory eagerly
            del dataframe_findbypipeline
            gc.collect()
 8632
    def calculation_genotype_concordance(self) -> None:
        """
        The function `calculation_genotype_concordance` calculates the
        genotype concordance across samples for multi-caller VCF files and
        appends the result to the INFO field.

        Does nothing when the VCF has no FORMAT column or no samples.
        """

        # Only applicable when genotypes are present (FORMAT column + samples)
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # genotypeconcordance annotation field
            genotypeconcordance_tag = "genotypeconcordance"

            # Header description for the generated INFO tag
            vcf_infos_tags = {
                genotypeconcordance_tag: "Concordance of genotype for multi caller VCF",
            }

            # Prefix of exploded INFO columns
            prefix = self.get_explode_infos_prefix()

            # Name of the dataframe column holding the computed value
            genotypeconcordance_infos = prefix + genotypeconcordance_tag

            # Variants table name
            table_variants = self.get_table_variants()

            # VCF header object (its `infos` dict is mutated below)
            vcf_reader = self.get_header()

            # Temporary variant id column used as join key for the UPDATE
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # Columns needed for the computation: id, FORMAT and samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                self.get_header_sample_list()
            )

            # Fetch genotype data into a dataframe
            dataframe_genotypeconcordance = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Compute the concordance value for each variant row
            dataframe_genotypeconcordance[genotypeconcordance_infos] = (
                dataframe_genotypeconcordance.apply(
                    lambda row: genotypeconcordance(
                        row, samples=self.get_header_sample_list()
                    ),
                    axis=1,
                )
            )

            # Register the genotypeconcordance tag in the VCF header
            # NOTE(review): the fallback description "snpEff hgvs annotations"
            # is dead copy-paste — the key is always present in
            # `vcf_infos_tags` (defined just above)
            vcf_reader.infos[genotypeconcordance_tag] = vcf.parser._Info(
                genotypeconcordance_tag,
                ".",
                "String",
                vcf_infos_tags.get(genotypeconcordance_tag, "snpEff hgvs annotations"),
                "howard calculation",
                "0",
                self.code_type_map.get("String"),
            )

            # Append "<tag>=<value>" to INFO. The SQL references the pandas
            # dataframe by its Python variable name (duckdb replacement scan)
            # — do not rename `dataframe_genotypeconcordance`
            sql_update = f"""
                UPDATE variants
                SET "INFO" = 
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE
                            WHEN dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT IN ('','.')
                                AND dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT NULL
                            THEN concat(
                                    '{genotypeconcordance_tag}=',
                                    dataframe_genotypeconcordance."{genotypeconcordance_infos}"
                                )
                            ELSE ''
                        END
                    )
                FROM dataframe_genotypeconcordance
                WHERE variants."{variant_id_column}" = dataframe_genotypeconcordance."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Drop the temporary variant id column
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Free the dataframe memory eagerly
            del dataframe_genotypeconcordance
            gc.collect()
 8732
 8733    def calculation_barcode(self, tag: str = "barcode") -> None:
 8734        """
 8735        The `calculation_barcode` function calculates barcode values for variants in a VCF file and
 8736        updates the INFO field in the file with the calculated barcode values.
 8737
 8738        :param tag: The `tag` parameter in the `calculation_barcode` function is used to specify the tag
 8739        name that will be used for the barcode calculation in the VCF file. If no tag name is provided,
 8740        the default tag name is set to "barcode", defaults to barcode
 8741        :type tag: str (optional)
 8742        """
 8743
 8744        # if FORMAT and samples
 8745        if (
 8746            "FORMAT" in self.get_header_columns_as_list()
 8747            and self.get_header_sample_list()
 8748        ):
 8749
 8750            # barcode annotation field
 8751            if not tag:
 8752                tag = "barcode"
 8753
 8754            # VCF infos tags
 8755            vcf_infos_tags = {
 8756                tag: "barcode calculation (VaRank)",
 8757            }
 8758
 8759            # Prefix
 8760            prefix = self.get_explode_infos_prefix()
 8761
 8762            # Field
 8763            barcode_infos = prefix + tag
 8764
 8765            # Variants table
 8766            table_variants = self.get_table_variants()
 8767
 8768            # Header
 8769            vcf_reader = self.get_header()
 8770
 8771            # Create variant id
 8772            variant_id_column = self.get_variant_id_column()
 8773            added_columns = [variant_id_column]
 8774
 8775            # variant_id, FORMAT and samples
 8776            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
 8777                self.get_header_sample_list()
 8778            )
 8779
 8780            # Create dataframe
 8781            dataframe_barcode = self.get_query_to_df(
 8782                f""" SELECT {samples_fields} FROM {table_variants} """
 8783            )
 8784
 8785            # Create barcode column
 8786            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
 8787                lambda row: barcode(row, samples=self.get_header_sample_list()), axis=1
 8788            )
 8789
 8790            # Add barcode to header
 8791            vcf_reader.infos[tag] = vcf.parser._Info(
 8792                tag,
 8793                ".",
 8794                "String",
 8795                vcf_infos_tags.get(tag, vcf_infos_tags.get(tag)),
 8796                "howard calculation",
 8797                "0",
 8798                self.code_type_map.get("String"),
 8799            )
 8800
 8801            # Update
 8802            sql_update = f"""
 8803                UPDATE {table_variants}
 8804                SET "INFO" = 
 8805                    concat(
 8806                        CASE
 8807                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 8808                            THEN ''
 8809                            ELSE concat("INFO", ';')
 8810                        END,
 8811                        CASE
 8812                            WHEN dataframe_barcode."{barcode_infos}" NOT IN ('','.')
 8813                            AND dataframe_barcode."{barcode_infos}" NOT NULL
 8814                            THEN concat(
 8815                                    '{tag}=',
 8816                                    dataframe_barcode."{barcode_infos}"
 8817                                )
 8818                            ELSE ''
 8819                        END
 8820                    )
 8821                FROM dataframe_barcode
 8822                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
 8823            """
 8824            self.conn.execute(sql_update)
 8825
 8826            # Remove added columns
 8827            for added_column in added_columns:
 8828                self.drop_column(column=added_column)
 8829
 8830            # Delete dataframe
 8831            del dataframe_barcode
 8832            gc.collect()
 8833
 8834    def calculation_barcode_family(self, tag: str = "BCF") -> None:
 8835        """
 8836        The `calculation_barcode_family` function calculates barcode values for variants in a VCF file
 8837        and updates the INFO field in the file with the calculated barcode values.
 8838
 8839        :param tag: The `tag` parameter in the `calculation_barcode_family` function is used to specify
 8840        the barcode tag that will be added to the VCF file during the calculation process. If no value
 8841        is provided for the `tag` parameter, the default value used is "BCF", defaults to BCF
 8842        :type tag: str (optional)
 8843        """
 8844
 8845        # if FORMAT and samples
 8846        if (
 8847            "FORMAT" in self.get_header_columns_as_list()
 8848            and self.get_header_sample_list()
 8849        ):
 8850
 8851            # barcode annotation field
 8852            if not tag:
 8853                tag = "BCF"
 8854
 8855            # VCF infos tags
 8856            vcf_infos_tags = {
 8857                tag: "barcode family calculation",
 8858                f"{tag}S": "barcode family samples",
 8859            }
 8860
 8861            # Param
 8862            param = self.get_param()
 8863            log.debug(f"param={param}")
 8864
 8865            # Prefix
 8866            prefix = self.get_explode_infos_prefix()
 8867
 8868            # PED param
 8869            ped = (
 8870                param.get("calculation", {})
 8871                .get("calculations", {})
 8872                .get("BARCODEFAMILY", {})
 8873                .get("family_pedigree", None)
 8874            )
 8875            log.debug(f"ped={ped}")
 8876
 8877            # Load PED
 8878            if ped:
 8879
 8880                # Pedigree is a file
 8881                if isinstance(ped, str) and os.path.exists(full_path(ped)):
 8882                    log.debug("Pedigree is file")
 8883                    with open(full_path(ped)) as ped:
 8884                        ped = json.load(ped)
 8885
 8886                # Pedigree is a string
 8887                elif isinstance(ped, str):
 8888                    log.debug("Pedigree is str")
 8889                    try:
 8890                        ped = json.loads(ped)
 8891                        log.debug("Pedigree is json str")
 8892                    except ValueError as e:
 8893                        ped_samples = ped.split(",")
 8894                        ped = {}
 8895                        for ped_sample in ped_samples:
 8896                            ped[ped_sample] = ped_sample
 8897
 8898                # Pedigree is a dict
 8899                elif isinstance(ped, dict):
 8900                    log.debug("Pedigree is dict")
 8901
 8902                # Pedigree is not well formatted
 8903                else:
 8904                    msg_error = "Pedigree not well formatted"
 8905                    log.error(msg_error)
 8906                    raise ValueError(msg_error)
 8907
 8908                # Construct list
 8909                ped_samples = list(ped.values())
 8910
 8911            else:
 8912                log.debug("Pedigree not defined. Take all samples")
 8913                ped_samples = self.get_header_sample_list()
 8914                ped = {}
 8915                for ped_sample in ped_samples:
 8916                    ped[ped_sample] = ped_sample
 8917
 8918            # Check pedigree
 8919            if not ped or len(ped) == 0:
 8920                msg_error = f"Error in pedigree: samples {ped_samples}"
 8921                log.error(msg_error)
 8922                raise ValueError(msg_error)
 8923
 8924            # Log
 8925            log.info(
 8926                "Calculation 'BARCODEFAMILY' - Samples: "
 8927                + ", ".join([f"{member}='{ped[member]}'" for member in ped])
 8928            )
 8929            log.debug(f"ped_samples={ped_samples}")
 8930
 8931            # Field
 8932            barcode_infos = prefix + tag
 8933
 8934            # Variants table
 8935            table_variants = self.get_table_variants()
 8936
 8937            # Header
 8938            vcf_reader = self.get_header()
 8939
 8940            # Create variant id
 8941            variant_id_column = self.get_variant_id_column()
 8942            added_columns = [variant_id_column]
 8943
 8944            # variant_id, FORMAT and samples
 8945            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
 8946                ped_samples
 8947            )
 8948
 8949            # Create dataframe
 8950            dataframe_barcode = self.get_query_to_df(
 8951                f""" SELECT {samples_fields} FROM {table_variants} """
 8952            )
 8953
 8954            # Create barcode column
 8955            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
 8956                lambda row: barcode(row, samples=ped_samples), axis=1
 8957            )
 8958
 8959            # Add barcode family to header
 8960            # Add vaf_normalization to header
 8961            vcf_reader.formats[tag] = vcf.parser._Format(
 8962                id=tag,
 8963                num=".",
 8964                type="String",
 8965                desc=vcf_infos_tags.get(tag, "barcode family calculation"),
 8966                type_code=self.code_type_map.get("String"),
 8967            )
 8968            vcf_reader.formats[f"{tag}S"] = vcf.parser._Format(
 8969                id=f"{tag}S",
 8970                num=".",
 8971                type="String",
 8972                desc=vcf_infos_tags.get(f"{tag}S", "barcode family samples"),
 8973                type_code=self.code_type_map.get("String"),
 8974            )
 8975
 8976            # Update
 8977            # for sample in ped_samples:
 8978            sql_update_set = []
 8979            for sample in self.get_header_sample_list() + ["FORMAT"]:
 8980                if sample in ped_samples:
 8981                    value = f'dataframe_barcode."{barcode_infos}"'
 8982                    value_samples = "'" + ",".join(ped_samples) + "'"
 8983                elif sample == "FORMAT":
 8984                    value = f"'{tag}'"
 8985                    value_samples = f"'{tag}S'"
 8986                else:
 8987                    value = "'.'"
 8988                    value_samples = "'.'"
 8989                format_regex = r"[a-zA-Z0-9\s]"
 8990                sql_update_set.append(
 8991                    f"""
 8992                        "{sample}" = 
 8993                        concat(
 8994                            CASE
 8995                                WHEN {table_variants}."{sample}" = './.'
 8996                                THEN concat('./.',regexp_replace(regexp_replace({table_variants}.FORMAT, '{format_regex}', '', 'g'), ':', ':.', 'g'))
 8997                                ELSE {table_variants}."{sample}"
 8998                            END,
 8999                            ':',
 9000                            {value},
 9001                            ':',
 9002                            {value_samples}
 9003                        )
 9004                    """
 9005                )
 9006
 9007            sql_update_set_join = ", ".join(sql_update_set)
 9008            sql_update = f"""
 9009                UPDATE {table_variants}
 9010                SET {sql_update_set_join}
 9011                FROM dataframe_barcode
 9012                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
 9013            """
 9014            self.conn.execute(sql_update)
 9015
 9016            # Remove added columns
 9017            for added_column in added_columns:
 9018                self.drop_column(column=added_column)
 9019
 9020            # Delete dataframe
 9021            del dataframe_barcode
 9022            gc.collect()
 9023
 9024    def calculation_trio(self) -> None:
 9025        """
 9026        The `calculation_trio` function performs trio calculations on a VCF file by adding trio
 9027        information to the INFO field of each variant.
 9028        """
 9029
 9030        # if FORMAT and samples
 9031        if (
 9032            "FORMAT" in self.get_header_columns_as_list()
 9033            and self.get_header_sample_list()
 9034        ):
 9035
 9036            # trio annotation field
 9037            trio_tag = "trio"
 9038
 9039            # VCF infos tags
 9040            vcf_infos_tags = {
 9041                "trio": "trio calculation",
 9042            }
 9043
 9044            # Param
 9045            param = self.get_param()
 9046
 9047            # Prefix
 9048            prefix = self.get_explode_infos_prefix()
 9049
 9050            # Trio param
 9051            trio_ped = (
 9052                param.get("calculation", {})
 9053                .get("calculations", {})
 9054                .get("TRIO", {})
 9055                .get("trio_pedigree", None)
 9056            )
 9057
 9058            # Load trio
 9059            if trio_ped:
 9060
 9061                # Trio pedigree is a file
 9062                if isinstance(trio_ped, str) and os.path.exists(full_path(trio_ped)):
 9063                    log.debug("TRIO pedigree is file")
 9064                    with open(full_path(trio_ped)) as trio_ped:
 9065                        trio_ped = json.load(trio_ped)
 9066
 9067                # Trio pedigree is a string
 9068                elif isinstance(trio_ped, str):
 9069                    log.debug("TRIO pedigree is str")
 9070                    try:
 9071                        trio_ped = json.loads(trio_ped)
 9072                        log.debug("TRIO pedigree is json str")
 9073                    except ValueError as e:
 9074                        trio_samples = trio_ped.split(",")
 9075                        if len(trio_samples) == 3:
 9076                            trio_ped = {
 9077                                "father": trio_samples[0],
 9078                                "mother": trio_samples[1],
 9079                                "child": trio_samples[2],
 9080                            }
 9081                            log.debug("TRIO pedigree is list str")
 9082                        else:
 9083                            msg_error = "TRIO pedigree not well formatted"
 9084                            log.error(msg_error)
 9085                            raise ValueError(msg_error)
 9086
 9087                # Trio pedigree is a dict
 9088                elif isinstance(trio_ped, dict):
 9089                    log.debug("TRIO pedigree is dict")
 9090
 9091                # Trio pedigree is not well formatted
 9092                else:
 9093                    msg_error = "TRIO pedigree not well formatted"
 9094                    log.error(msg_error)
 9095                    raise ValueError(msg_error)
 9096
 9097                # Construct trio list
 9098                trio_samples = [
 9099                    trio_ped.get("father", ""),
 9100                    trio_ped.get("mother", ""),
 9101                    trio_ped.get("child", ""),
 9102                ]
 9103
 9104            else:
 9105                log.debug("TRIO pedigree not defined. Take the first 3 samples")
 9106                samples_list = self.get_header_sample_list()
 9107                if len(samples_list) >= 3:
 9108                    trio_samples = self.get_header_sample_list()[0:3]
 9109                    trio_ped = {
 9110                        "father": trio_samples[0],
 9111                        "mother": trio_samples[1],
 9112                        "child": trio_samples[2],
 9113                    }
 9114                else:
 9115                    msg_error = f"Error in TRIO pedigree: only {len(samples_list)} samples {samples_list}"
 9116                    log.error(msg_error)
 9117                    raise ValueError(msg_error)
 9118
 9119            # Check trio pedigree
 9120            if not trio_ped or len(trio_ped) != 3:
 9121                msg_error = f"Error in TRIO pedigree: {trio_ped}"
 9122                log.error(msg_error)
 9123                raise ValueError(msg_error)
 9124
 9125            # Log
 9126            log.info(
 9127                f"Calculation 'TRIO' - Samples: "
 9128                + ", ".join([f"{member}='{trio_ped[member]}'" for member in trio_ped])
 9129            )
 9130
 9131            # Field
 9132            trio_infos = prefix + trio_tag
 9133
 9134            # Variants table
 9135            table_variants = self.get_table_variants()
 9136
 9137            # Header
 9138            vcf_reader = self.get_header()
 9139
 9140            # Create variant id
 9141            variant_id_column = self.get_variant_id_column()
 9142            added_columns = [variant_id_column]
 9143
 9144            # variant_id, FORMAT and samples
 9145            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
 9146                self.get_header_sample_list()
 9147            )
 9148
 9149            # Create dataframe
 9150            dataframe_trio = self.get_query_to_df(
 9151                f""" SELECT {samples_fields} FROM {table_variants} """
 9152            )
 9153
 9154            # Create trio column
 9155            dataframe_trio[trio_infos] = dataframe_trio.apply(
 9156                lambda row: trio(row, samples=trio_samples), axis=1
 9157            )
 9158
 9159            # Add trio to header
 9160            vcf_reader.infos[trio_tag] = vcf.parser._Info(
 9161                trio_tag,
 9162                ".",
 9163                "String",
 9164                vcf_infos_tags.get(trio_tag, "snpEff hgvs annotations"),
 9165                "howard calculation",
 9166                "0",
 9167                self.code_type_map.get("String"),
 9168            )
 9169
 9170            # Update
 9171            sql_update = f"""
 9172                UPDATE {table_variants}
 9173                SET "INFO" = 
 9174                    concat(
 9175                        CASE
 9176                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 9177                            THEN ''
 9178                            ELSE concat("INFO", ';')
 9179                        END,
 9180                        CASE
 9181                            WHEN dataframe_trio."{trio_infos}" NOT IN ('','.')
 9182                             AND dataframe_trio."{trio_infos}" NOT NULL
 9183                            THEN concat(
 9184                                    '{trio_tag}=',
 9185                                    dataframe_trio."{trio_infos}"
 9186                                )
 9187                            ELSE ''
 9188                        END
 9189                    )
 9190                FROM dataframe_trio
 9191                WHERE {table_variants}."{variant_id_column}" = dataframe_trio."{variant_id_column}"
 9192            """
 9193            self.conn.execute(sql_update)
 9194
 9195            # Remove added columns
 9196            for added_column in added_columns:
 9197                self.drop_column(column=added_column)
 9198
 9199            # Delete dataframe
 9200            del dataframe_trio
 9201            gc.collect()
 9202
 9203    def calculation_vaf_normalization(self) -> None:
 9204        """
 9205        The `calculation_vaf_normalization` function calculates the VAF (Variant Allele Frequency)
 9206        normalization for each sample in a VCF file and updates the FORMAT and INFO fields accordingly.
 9207        :return: The function does not return anything.
 9208        """
 9209
 9210        # if FORMAT and samples
 9211        if (
 9212            "FORMAT" in self.get_header_columns_as_list()
 9213            and self.get_header_sample_list()
 9214        ):
 9215
 9216            # vaf_normalization annotation field
 9217            vaf_normalization_tag = "VAF"
 9218
 9219            # VCF infos tags
 9220            vcf_infos_tags = {
 9221                "VAF": "VAF Variant Frequency",
 9222            }
 9223
 9224            # Prefix
 9225            prefix = self.get_explode_infos_prefix()
 9226
 9227            # Variants table
 9228            table_variants = self.get_table_variants()
 9229
 9230            # Header
 9231            vcf_reader = self.get_header()
 9232
 9233            # Do not calculate if VAF already exists
 9234            if "VAF" in vcf_reader.formats:
 9235                log.debug("VAF already on genotypes")
 9236                return
 9237
 9238            # Create variant id
 9239            variant_id_column = self.get_variant_id_column()
 9240            added_columns = [variant_id_column]
 9241
 9242            # variant_id, FORMAT and samples
 9243            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
 9244                f""" "{sample}" """ for sample in self.get_header_sample_list()
 9245            )
 9246
 9247            # Create dataframe
 9248            query = f""" SELECT {variant_id_column}, FORMAT, {samples_fields} FROM {table_variants} """
 9249            log.debug(f"query={query}")
 9250            dataframe_vaf_normalization = self.get_query_to_df(query=query)
 9251
 9252            vaf_normalization_set = []
 9253
 9254            # for each sample vaf_normalization
 9255            for sample in self.get_header_sample_list():
 9256                dataframe_vaf_normalization[sample] = dataframe_vaf_normalization.apply(
 9257                    lambda row: vaf_normalization(row, sample=sample), axis=1
 9258                )
 9259                vaf_normalization_set.append(
 9260                    f""" "{sample}" = dataframe_vaf_normalization."{sample}" """
 9261                )
 9262
 9263            # Add VAF to FORMAT
 9264            dataframe_vaf_normalization["FORMAT"] = dataframe_vaf_normalization[
 9265                "FORMAT"
 9266            ].apply(lambda x: str(x) + ":VAF")
 9267            vaf_normalization_set.append(
 9268                f""" "FORMAT" = dataframe_vaf_normalization."FORMAT" """
 9269            )
 9270
 9271            # Add vaf_normalization to header
 9272            vcf_reader.formats[vaf_normalization_tag] = vcf.parser._Format(
 9273                id=vaf_normalization_tag,
 9274                num="1",
 9275                type="Float",
 9276                desc=vcf_infos_tags.get(vaf_normalization_tag, "VAF Variant Frequency"),
 9277                type_code=self.code_type_map.get("Float"),
 9278            )
 9279
 9280            # Create fields to add in INFO
 9281            sql_vaf_normalization_set = " , ".join(vaf_normalization_set)
 9282
 9283            # Update
 9284            sql_update = f"""
 9285                UPDATE {table_variants}
 9286                SET {sql_vaf_normalization_set}
 9287                FROM dataframe_vaf_normalization
 9288                WHERE variants."{variant_id_column}" = dataframe_vaf_normalization."{variant_id_column}"
 9289
 9290            """
 9291            self.conn.execute(sql_update)
 9292
 9293            # Remove added columns
 9294            for added_column in added_columns:
 9295                self.drop_column(column=added_column)
 9296
 9297            # Delete dataframe
 9298            del dataframe_vaf_normalization
 9299            gc.collect()
 9300
    def calculation_genotype_stats(self, info: str = "VAF") -> None:
        """
        The `calculation_genotype_stats` function calculates genotype statistics for a given
        information field in a VCF file and updates the INFO column of the variants table with
        the calculated statistics.

        Seven INFO fields are produced: '<info>_stats_nb', '_list', '_min', '_max', '_mean',
        '_mediane' and '_stdev', computed per variant across all samples by the
        `genotype_stats` helper.

        :param info: The `info` parameter is a string that represents the type of information for which
        genotype statistics are calculated. It is used to generate various VCF info tags for the
        statistics, such as the number of occurrences, the list of values, the minimum value, the
        maximum value, the mean, the median, defaults to VAF
        :type info: str (optional)
        """

        # Nothing to do without a FORMAT column and genotype samples
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # vaf_stats annotation field
            vaf_stats_tag = info + "_stats"

            # One VCF INFO tag per statistic, keyed by the final INFO field name
            vcf_infos_tags = {
                info + "_stats_nb": f"genotype {info} Statistics - number of {info}",
                info + "_stats_list": f"genotype {info} Statistics - list of {info}",
                info + "_stats_min": f"genotype {info} Statistics - min {info}",
                info + "_stats_max": f"genotype {info} Statistics - max {info}",
                info + "_stats_mean": f"genotype {info} Statistics - mean {info}",
                info + "_stats_mediane": f"genotype {info} Statistics - mediane {info}",
                info
                + "_stats_stdev": f"genotype {info} Statistics - standard deviation {info}",
            }

            # Prefix for exploded INFO columns
            prefix = self.get_explode_infos_prefix()

            # Dataframe column holding the per-variant stats dict
            vaf_stats_infos = prefix + vaf_stats_tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Variant id column (added column, used to join the dataframe back to the table)
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                self.get_header_sample_list()
            )

            # Create dataframe with genotype columns only
            dataframe_vaf_stats = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Compute the stats dict per variant
            # (genotype_stats is assumed to return a dict keyed like vcf_infos_tags
            #  — TODO confirm against howard.functions.commons)
            dataframe_vaf_stats[vaf_stats_infos] = dataframe_vaf_stats.apply(
                lambda row: genotype_stats(
                    row, samples=self.get_header_sample_list(), info=info
                ),
                axis=1,
            )

            # SQL fragments, one per statistic, joined into the final UPDATE
            sql_vaf_stats_fields = []

            # For each statistic: extract it into its own column, declare it in the
            # header, and build the SQL fragment appending it to INFO
            for stat in vcf_infos_tags:

                # Extract this statistic from the stats dict into a dedicated column
                dataframe_vaf_stats[stat] = dataframe_vaf_stats[vaf_stats_infos].apply(
                    lambda x: dict(x).get(stat, "")
                )

                # Declare the statistic field in the VCF header
                vcf_reader.infos[stat] = vcf.parser._Info(
                    stat,
                    ".",
                    "String",
                    vcf_infos_tags.get(stat, "genotype statistics"),
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )

                # ';' separator is baked into every fragment after the first
                # NOTE(review): if the FIRST statistic is NULL for a variant, the next
                # fragment still prepends ';', which can leave a stray leading ';' in
                # INFO — confirm whether downstream parsing tolerates this
                if len(sql_vaf_stats_fields):
                    sep = ";"
                else:
                    sep = ""

                # SQL fragment: '<sep><stat>=<value>' when the value is not NULL
                sql_vaf_stats_fields.append(
                    f"""
                        CASE
                            WHEN dataframe_vaf_stats."{stat}" NOT NULL
                            THEN concat(
                                    '{sep}{stat}=',
                                    dataframe_vaf_stats."{stat}"
                                )
                            ELSE ''
                        END
                    """
                )

            # SQL set for update
            sql_vaf_stats_fields_set = ",  ".join(sql_vaf_stats_fields)

            # Append all statistics to the INFO field, joining on the variant id column
            sql_update = f"""
                UPDATE {table_variants}
                SET "INFO" = 
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        {sql_vaf_stats_fields_set}
                    )
                FROM dataframe_vaf_stats
                WHERE {table_variants}."{variant_id_column}" = dataframe_vaf_stats."{variant_id_column}"

            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe
            del dataframe_vaf_stats
            gc.collect()
 9438
 9439    def calculation_transcripts_annotation(
 9440        self, info_json: str = None, info_format: str = None
 9441    ) -> None:
 9442        """
 9443        The `calculation_transcripts_annotation` function creates a transcripts table and adds an info
 9444        field to it if transcripts are available.
 9445
 9446        :param info_json: The `info_json` parameter in the `calculation_transcripts_annotation` method
 9447        is a string parameter that represents the information field to be used in the transcripts JSON.
 9448        It is used to specify the JSON format for the transcripts information. If no value is provided
 9449        when calling the method, it defaults to "
 9450        :type info_json: str
 9451        :param info_format: The `info_format` parameter in the `calculation_transcripts_annotation`
 9452        method is a string parameter that specifies the format of the information field to be used in
 9453        the transcripts JSON. It is used to define the format of the information field
 9454        :type info_format: str
 9455        """
 9456
 9457        # Create transcripts table
 9458        transcripts_table = self.create_transcript_view()
 9459
 9460        # Add info field
 9461        if transcripts_table:
 9462            self.transcript_view_to_variants(
 9463                transcripts_table=transcripts_table,
 9464                transcripts_info_field_json=info_json,
 9465                transcripts_info_field_format=info_format,
 9466            )
 9467        else:
 9468            log.info("No Transcripts to process. Check param.json file configuration")
 9469
 9470    def calculation_transcripts_prioritization(self) -> None:
 9471        """
 9472        The function `calculation_transcripts_prioritization` creates a transcripts table and
 9473        prioritizes transcripts based on certain criteria.
 9474        """
 9475
 9476        # Create transcripts table
 9477        transcripts_table = self.create_transcript_view()
 9478
 9479        # Add info field
 9480        if transcripts_table:
 9481            self.transcripts_prioritization(transcripts_table=transcripts_table)
 9482        else:
 9483            log.info("No Transcripts to process. Check param.json file configuration")
 9484
 9485    ###############
 9486    # Transcripts #
 9487    ###############
 9488
 9489    def transcripts_prioritization(
 9490        self, transcripts_table: str = None, param: dict = {}
 9491    ) -> bool:
 9492        """
 9493        The `transcripts_prioritization` function prioritizes transcripts based on certain parameters
 9494        and updates the variants table with the prioritized information.
 9495
 9496        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name
 9497        of the table containing transcripts data. If no value is provided, it defaults to "transcripts".
 9498        This parameter is used to identify the table where the transcripts data is stored for the
 9499        prioritization process
 9500        :type transcripts_table: str
 9501        :param param: The `param` parameter in the `transcripts_prioritization` method is a dictionary
 9502        that contains various configuration settings for the prioritization process of transcripts. It
 9503        is used to customize the behavior of the prioritization algorithm and includes settings such as
 9504        the prefix for prioritization fields, default profiles, and other
 9505        :type param: dict
 9506        :return: The function `transcripts_prioritization` returns a boolean value `True` if the
 9507        transcripts prioritization process is successfully completed, and `False` if there are any
 9508        issues or if no profile is defined for transcripts prioritization.
 9509        """
 9510
 9511        log.debug("Start transcripts prioritization...")
 9512
 9513        # Param
 9514        if not param:
 9515            param = self.get_param()
 9516
 9517        # Variants table
 9518        table_variants = self.get_table_variants()
 9519        log.debug(f"transcripts_table={transcripts_table}")
 9520        # Transcripts table
 9521        if transcripts_table is None:
 9522            log.debug(f"transcripts_table={transcripts_table}")
 9523            transcripts_table = self.create_transcript_view(
 9524                transcripts_table="transcripts", param=param
 9525            )
 9526            log.debug(f"transcripts_table={transcripts_table}")
 9527        if transcripts_table is None:
 9528            msg_err = "No Transcripts table availalble"
 9529            log.error(msg_err)
 9530            raise ValueError(msg_err)
 9531
 9532        # Get transcripts columns
 9533        columns_as_list_query = f"""
 9534            DESCRIBE {transcripts_table}
 9535        """
 9536        columns_as_list = list(
 9537            self.get_query_to_df(columns_as_list_query)["column_name"]
 9538        )
 9539
 9540        # Create INFO if not exists
 9541        if "INFO" not in columns_as_list:
 9542            query_add_info = f"""
 9543                ALTER TABLE {transcripts_table} ADD COLUMN INFO STRING DEFAULT '';
 9544            """
 9545            self.execute_query(query_add_info)
 9546
 9547        # Prioritization param and Force only PZ Score and Flag
 9548        pz_param = param.get("transcripts", {}).get("prioritization", {})
 9549        pz_fields_score = pz_param.get("pzprefix", "PTZ") + "Score"
 9550        pz_fields_flag = pz_param.get("pzprefix", "PTZ") + "Flag"
 9551        pz_fields_transcripts = pz_param.get("pzprefix", "PTZ") + "Transcript"
 9552        pz_param["pzfields"] = [pz_fields_score, pz_fields_flag]
 9553        pz_profile_default = (
 9554            param.get("transcripts", {}).get("prioritization", {}).get("profiles", None)
 9555        )
 9556
 9557        # Exit if no profile
 9558        if pz_profile_default is None:
 9559            log.warning("No profile defined for transcripts prioritization")
 9560            return False
 9561
 9562        # Prioritization
 9563        prioritization_result = self.prioritization(
 9564            table=transcripts_table,
 9565            pz_param=param.get("transcripts", {}).get("prioritization", {}),
 9566        )
 9567        if not prioritization_result:
 9568            log.warning("Transcripts prioritization not processed")
 9569            return False
 9570
 9571        # Explode PZ fields
 9572        self.explode_infos(
 9573            table=transcripts_table,
 9574            fields=param.get("transcripts", {})
 9575            .get("prioritization", {})
 9576            .get("pzfields", []),
 9577        )
 9578
 9579        # Export Transcripts prioritization infos to variants table
 9580        query_update = f"""
 9581            WITH RankedTranscripts AS (
 9582                SELECT
 9583                    "#CHROM", POS, REF, ALT, transcript, {pz_fields_score}, {pz_fields_flag},
 9584                    ROW_NUMBER() OVER (
 9585                        PARTITION BY "#CHROM", POS, REF, ALT
 9586                        ORDER BY {pz_fields_flag} ASC, {pz_fields_score} DESC, transcript ASC
 9587                    ) AS rn
 9588                FROM
 9589                    {transcripts_table}
 9590            )
 9591            UPDATE {table_variants}
 9592                SET
 9593                INFO = CONCAT(CASE
 9594                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 9595                            THEN ''
 9596                            ELSE concat("INFO", ';')
 9597                        END,
 9598                        concat('{pz_fields_transcripts}=', transcript, ';{pz_fields_score}=', {pz_fields_score}, ';{pz_fields_flag}=', {pz_fields_flag})
 9599                        )
 9600            FROM
 9601                RankedTranscripts
 9602            WHERE
 9603                rn = 1
 9604                AND variants."#CHROM" = RankedTranscripts."#CHROM"
 9605                AND variants."POS" = RankedTranscripts."POS"
 9606                AND variants."REF" = RankedTranscripts."REF"
 9607                AND variants."ALT" = RankedTranscripts."ALT"
 9608                
 9609        """
 9610        self.execute_query(query=query_update)
 9611
 9612        # Add PZ Transcript in header
 9613        self.get_header().infos[pz_fields_transcripts] = vcf.parser._Info(
 9614            pz_fields_transcripts,
 9615            ".",
 9616            "String",
 9617            f"Transcript selected from transcripts prioritization process, profile {pz_profile_default}",
 9618            "unknown",
 9619            "unknown",
 9620            code_type_map["String"],
 9621        )
 9622
 9623        # Return
 9624        return True
 9625
 9626    def create_transcript_view_from_columns_map(
 9627        self,
 9628        transcripts_table: str = "transcripts",
 9629        columns_maps: dict = {},
 9630        added_columns: list = [],
 9631        temporary_tables: list = None,
 9632        annotation_fields: list = None,
 9633    ) -> tuple[list, list, list]:
 9634        """
 9635        The `create_transcript_view_from_columns_map` function generates a temporary table view based on
 9636        specified columns mapping for transcripts data.
 9637
 9638        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name of
 9639        the table where the transcripts data is stored or will be stored in the database. This table
 9640        typically contains information about transcripts such as Ensembl transcript IDs, gene names, scores,
 9641        predictions, etc. It defaults to "transcripts, defaults to transcripts
 9642        :type transcripts_table: str (optional)
 9643        :param columns_maps: The `columns_maps` parameter is a dictionary that contains information about
 9644        how to map columns from a transcripts table to create a view. Each entry in the `columns_maps` list
 9645        represents a mapping configuration for a specific set of columns. It typically includes details such
 9646        as the main transcript column and additional information columns
 9647        :type columns_maps: dict
 9648        :param added_columns: The `added_columns` parameter in the `create_transcript_view_from_columns_map`
 9649        function is a list that stores the additional columns that will be added to the view being created
 9650        based on the columns map provided. These columns are generated by exploding the transcript
 9651        information columns along with the main transcript column
 9652        :type added_columns: list
 9653        :param temporary_tables: The `temporary_tables` parameter in the
 9654        `create_transcript_view_from_columns_map` function is a list that stores the names of temporary
 9655        tables created during the process of creating a transcript view from a columns map. These temporary
 9656        tables are used to store intermediate results or transformations before the final view is generated
 9657        :type temporary_tables: list
 9658        :param annotation_fields: The `annotation_fields` parameter in the
 9659        `create_transcript_view_from_columns_map` function is a list that stores the fields that are used
 9660        for annotation in the query view creation process. These fields are extracted from the
 9661        `transcripts_column` and `transcripts_infos_columns` specified in the `columns
 9662        :type annotation_fields: list
 9663        :return: The function `create_transcript_view_from_columns_map` returns a tuple containing three
 9664        lists: `added_columns`, `temporary_tables`, and `annotation_fields`.
 9665        """
 9666
 9667        log.debug("Start transcrpts view creation from columns map...")
 9668
 9669        # "from_columns_map": [
 9670        #     {
 9671        #         "transcripts_column": "Ensembl_transcriptid",
 9672        #         "transcripts_infos_columns": [
 9673        #             "genename",
 9674        #             "Ensembl_geneid",
 9675        #             "LIST_S2_score",
 9676        #             "LIST_S2_pred",
 9677        #         ],
 9678        #     },
 9679        #     {
 9680        #         "transcripts_column": "Ensembl_transcriptid",
 9681        #         "transcripts_infos_columns": [
 9682        #             "genename",
 9683        #             "VARITY_R_score",
 9684        #             "Aloft_pred",
 9685        #         ],
 9686        #     },
 9687        # ],
 9688
 9689        # Init
 9690        if temporary_tables is None:
 9691            temporary_tables = []
 9692        if annotation_fields is None:
 9693            annotation_fields = []
 9694
 9695        # Variants table
 9696        table_variants = self.get_table_variants()
 9697
 9698        for columns_map in columns_maps:
 9699
 9700            # Transcript column
 9701            transcripts_column = columns_map.get("transcripts_column", None)
 9702
 9703            # Transcripts infos columns
 9704            transcripts_infos_columns = columns_map.get("transcripts_infos_columns", [])
 9705
 9706            if transcripts_column is not None:
 9707
 9708                # Explode
 9709                added_columns += self.explode_infos(
 9710                    fields=[transcripts_column] + transcripts_infos_columns
 9711                )
 9712
 9713                # View clauses
 9714                clause_select = []
 9715                for field in [transcripts_column] + transcripts_infos_columns:
 9716                    clause_select.append(
 9717                        f""" regexp_split_to_table("{field}", ',') AS '{field}' """
 9718                    )
 9719                    if field not in [transcripts_column]:
 9720                        annotation_fields.append(field)
 9721
 9722                # Querey View
 9723                query = f""" 
 9724                    SELECT
 9725                        "#CHROM", POS, REF, ALT, INFO,
 9726                        "{transcripts_column}" AS 'transcript',
 9727                        {", ".join(clause_select)}
 9728                    FROM (
 9729                        SELECT 
 9730                            "#CHROM", POS, REF, ALT, INFO,
 9731                            {", ".join(clause_select)}
 9732                        FROM {table_variants}
 9733                        )
 9734                    WHERE "{transcripts_column}" IS NOT NULL
 9735                """
 9736
 9737                # Create temporary table
 9738                temporary_table = transcripts_table + "".join(
 9739                    random.choices(string.ascii_uppercase + string.digits, k=10)
 9740                )
 9741
 9742                # Temporary_tables
 9743                temporary_tables.append(temporary_table)
 9744                query_view = f"""
 9745                    CREATE TEMPORARY TABLE {temporary_table}
 9746                    AS ({query})
 9747                """
 9748                self.execute_query(query=query_view)
 9749
 9750        return added_columns, temporary_tables, annotation_fields
 9751
 9752    def create_transcript_view_from_column_format(
 9753        self,
 9754        transcripts_table: str = "transcripts",
 9755        column_formats: dict = {},
 9756        temporary_tables: list = None,
 9757        annotation_fields: list = None,
 9758    ) -> tuple[list, list, list]:
 9759        """
 9760        The `create_transcript_view_from_column_format` function generates a transcript view based on
 9761        specified column formats, adds additional columns and annotation fields, and returns the list of
 9762        temporary tables and annotation fields.
 9763
 9764        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name of
 9765        the table containing the transcripts data. This table will be used as the base table for creating
 9766        the transcript view. The default value for this parameter is "transcripts", but you can provide a
 9767        different table name if needed, defaults to transcripts
 9768        :type transcripts_table: str (optional)
 9769        :param column_formats: The `column_formats` parameter is a dictionary that contains information
 9770        about the columns to be used for creating the transcript view. Each entry in the dictionary
 9771        specifies the mapping between a transcripts column and a transcripts infos column. For example, in
 9772        the provided code snippet:
 9773        :type column_formats: dict
 9774        :param temporary_tables: The `temporary_tables` parameter in the
 9775        `create_transcript_view_from_column_format` function is a list that stores the names of temporary
 9776        views created during the process of creating a transcript view from a column format. These temporary
 9777        views are used to manipulate and extract data before generating the final transcript view. It
 9778        :type temporary_tables: list
 9779        :param annotation_fields: The `annotation_fields` parameter in the
 9780        `create_transcript_view_from_column_format` function is a list that stores the annotation fields
 9781        that are extracted from the temporary views created during the process. These annotation fields are
 9782        obtained by querying the temporary views and extracting the column names excluding specific columns
 9783        like `#CH
 9784        :type annotation_fields: list
 9785        :return: The `create_transcript_view_from_column_format` function returns two lists:
 9786        `temporary_tables` and `annotation_fields`.
 9787        """
 9788
 9789        log.debug("Start transcrpts view creation from column format...")
 9790
 9791        #  "from_column_format": [
 9792        #     {
 9793        #         "transcripts_column": "ANN",
 9794        #         "transcripts_infos_column": "Feature_ID",
 9795        #     }
 9796        # ],
 9797
 9798        # Init
 9799        if temporary_tables is None:
 9800            temporary_tables = []
 9801        if annotation_fields is None:
 9802            annotation_fields = []
 9803
 9804        for column_format in column_formats:
 9805
 9806            # annotation field and transcript annotation field
 9807            annotation_field = column_format.get("transcripts_column", "ANN")
 9808            transcript_annotation = column_format.get(
 9809                "transcripts_infos_column", "Feature_ID"
 9810            )
 9811
 9812            # Temporary View name
 9813            temporary_view_name = transcripts_table + "".join(
 9814                random.choices(string.ascii_uppercase + string.digits, k=10)
 9815            )
 9816
 9817            # Create temporary view name
 9818            temporary_view_name = self.annotation_format_to_table(
 9819                uniquify=True,
 9820                annotation_field=annotation_field,
 9821                view_name=temporary_view_name,
 9822                annotation_id=transcript_annotation,
 9823            )
 9824
 9825            # Annotation fields
 9826            if temporary_view_name:
 9827                query_annotation_fields = f"""
 9828                    SELECT *
 9829                    FROM (
 9830                        DESCRIBE SELECT *
 9831                        FROM {temporary_view_name}
 9832                        )
 9833                        WHERE column_name not in ('#CHROM', 'POS', 'REF', 'ALT')
 9834                """
 9835                df_annotation_fields = self.get_query_to_df(
 9836                    query=query_annotation_fields
 9837                )
 9838
 9839                # Add temporary view and annotation fields
 9840                temporary_tables.append(temporary_view_name)
 9841                annotation_fields += list(set(df_annotation_fields["column_name"]))
 9842
 9843        return temporary_tables, annotation_fields
 9844
 9845    def create_transcript_view(
 9846        self,
 9847        transcripts_table: str = None,
 9848        transcripts_table_drop: bool = True,
 9849        param: dict = {},
 9850    ) -> str:
 9851        """
 9852        The `create_transcript_view` function generates a transcript view by processing data from a
 9853        specified table based on provided parameters and structural information.
 9854
 9855        :param transcripts_table: The `transcripts_table` parameter in the `create_transcript_view` function
 9856        is used to specify the name of the table that will store the final transcript view data. If a table
 9857        name is not provided, the function will create a new table to store the transcript view data, and by
 9858        default,, defaults to transcripts
 9859        :type transcripts_table: str (optional)
 9860        :param transcripts_table_drop: The `transcripts_table_drop` parameter in the
 9861        `create_transcript_view` function is a boolean parameter that determines whether to drop the
 9862        existing transcripts table before creating a new one. If `transcripts_table_drop` is set to `True`,
 9863        the function will drop the existing transcripts table if it exists, defaults to True
 9864        :type transcripts_table_drop: bool (optional)
 9865        :param param: The `param` parameter in the `create_transcript_view` function is a dictionary that
 9866        contains information needed to create a transcript view. It includes details such as the structure
 9867        of the transcripts, columns mapping, column formats, and other necessary information for generating
 9868        the view. This parameter allows for flexibility and customization
 9869        :type param: dict
 9870        :return: The `create_transcript_view` function returns the name of the transcripts table that was
 9871        created or modified during the execution of the function.
 9872        """
 9873
 9874        log.debug("Start transcripts view creation...")
 9875
 9876        # Default
 9877        transcripts_table_default = "transcripts"
 9878
 9879        # Param
 9880        if not param:
 9881            param = self.get_param()
 9882
 9883        # Struct
 9884        struct = param.get("transcripts", {}).get("struct", None)
 9885
 9886        if struct:
 9887
 9888            # Transcripts table
 9889            if transcripts_table is None:
 9890                transcripts_table = param.get("transcripts", {}).get(
 9891                    "table", transcripts_table_default
 9892                )
 9893
 9894            # added_columns
 9895            added_columns = []
 9896
 9897            # Temporary tables
 9898            temporary_tables = []
 9899
 9900            # Annotation fields
 9901            annotation_fields = []
 9902
 9903            # from columns map
 9904            columns_maps = struct.get("from_columns_map", [])
 9905            added_columns_tmp, temporary_tables_tmp, annotation_fields_tmp = (
 9906                self.create_transcript_view_from_columns_map(
 9907                    transcripts_table=transcripts_table,
 9908                    columns_maps=columns_maps,
 9909                    added_columns=added_columns,
 9910                    temporary_tables=temporary_tables,
 9911                    annotation_fields=annotation_fields,
 9912                )
 9913            )
 9914            added_columns += added_columns_tmp
 9915            temporary_tables += temporary_tables_tmp
 9916            annotation_fields += annotation_fields_tmp
 9917
 9918            # from column format
 9919            column_formats = struct.get("from_column_format", [])
 9920            temporary_tables_tmp, annotation_fields_tmp = (
 9921                self.create_transcript_view_from_column_format(
 9922                    transcripts_table=transcripts_table,
 9923                    column_formats=column_formats,
 9924                    temporary_tables=temporary_tables,
 9925                    annotation_fields=annotation_fields,
 9926                )
 9927            )
 9928            temporary_tables += temporary_tables_tmp
 9929            annotation_fields += annotation_fields_tmp
 9930
 9931            # Merge temporary tables query
 9932            query_merge = ""
 9933            for temporary_table in temporary_tables:
 9934
 9935                # First temporary table
 9936                if not query_merge:
 9937                    query_merge = f"""
 9938                        SELECT * FROM {temporary_table}
 9939                    """
 9940                # other temporary table (using UNION)
 9941                else:
 9942                    query_merge += f"""
 9943                        UNION BY NAME SELECT * FROM {temporary_table}
 9944                    """
 9945
 9946            # Merge on transcript
 9947            query_merge_on_transcripts_annotation_fields = []
 9948            # Aggregate all annotations fields
 9949            for annotation_field in set(annotation_fields):
 9950                query_merge_on_transcripts_annotation_fields.append(
 9951                    f""" list_aggregate(list_distinct(array_agg({annotation_field})), 'string_agg', ',') AS {annotation_field} """
 9952                )
 9953            # Query for transcripts view
 9954            query_merge_on_transcripts = f"""
 9955                SELECT "#CHROM", POS, REF, ALT, INFO, transcript, {", ".join(query_merge_on_transcripts_annotation_fields)}
 9956                FROM ({query_merge})
 9957                GROUP BY "#CHROM", POS, REF, ALT, INFO, transcript
 9958            """
 9959
 9960            # Drop transcript view is necessary
 9961            if transcripts_table_drop:
 9962                query_drop = f"""
 9963                    DROP TABLE IF EXISTS {transcripts_table};
 9964                """
 9965                self.execute_query(query=query_drop)
 9966
 9967            # Merge and create transcript view
 9968            query_create_view = f"""
 9969                CREATE TABLE IF NOT EXISTS {transcripts_table}
 9970                AS {query_merge_on_transcripts}
 9971            """
 9972            self.execute_query(query=query_create_view)
 9973
 9974            # Remove added columns
 9975            for added_column in added_columns:
 9976                self.drop_column(column=added_column)
 9977
 9978        else:
 9979
 9980            transcripts_table = None
 9981
 9982        return transcripts_table
 9983
    def annotation_format_to_table(
        self,
        uniquify: bool = True,
        annotation_field: str = "ANN",
        annotation_id: str = "Feature_ID",
        view_name: str = "transcripts",
    ) -> str:
        """
        Explode a structured annotation INFO field (e.g. snpEff 'ANN') into a
        temporary table with one column per annotation sub-field.

        The sub-field names are parsed from the INFO header description (the
        quoted " | "-separated list). Each annotation value is converted to
        JSON via `explode_annotation_format`, the JSON keys present in the
        data are typed by inspecting their values, and a temporary table
        named `view_name` is created with columns "#CHROM", POS, REF, ALT,
        INFO, one typed column per sub-field, and a 'transcript' column taken
        from the `annotation_id` sub-field.

        :param uniquify: Passed to `explode_annotation_format` when building
            the JSON output, defaults to True
        :param annotation_field: INFO field containing the structured
            annotation, defaults to "ANN"
        :param annotation_id: Annotation sub-field used as the transcript
            identifier (non-alphanumeric characters are stripped), defaults
            to "Feature_ID"
        :param view_name: Name of the temporary table to create, defaults to
            "transcripts"
        :raises ValueError: If the header description does not contain a
            parsable quoted sub-field list
        :return: The name of the created table, or None if `annotation_field`
            is not present in the VCF header
        """

        # Name of the intermediate column holding the JSON-exploded annotation
        annotation_format = "annotation_explode"

        # Keep alphanumeric characters only, so the id matches the cleaned
        # sub-field column names generated below
        annotation_id = "".join(char for char in annotation_id if char.isalnum())

        # Prefix for exploded INFO columns
        # NOTE(review): any truthy prefix is forced to "INFO/"; a falsy value
        # is kept as-is — presumably always "" in that case. Confirm
        # get_explode_infos_prefix never returns None, which would break the
        # string concatenations below.
        prefix = self.get_explode_infos_prefix()
        if prefix:
            prefix = "INFO/"

        # Column names for the annotation field and its exploded JSON form
        annotation_infos = prefix + annotation_field
        annotation_format_infos = prefix + annotation_format

        # Variants table
        table_variants = self.get_table_variants()

        # VCF header
        vcf_reader = self.get_header()

        # Columns added to the variants table (dropped before returning)
        added_columns = []

        # Explode the annotation INFO field into a column
        added_columns += self.explode_infos(fields=[annotation_field])

        if annotation_field in vcf_reader.infos:

            # Extract the annotation sub-field names from the header
            # description, expected to contain a quoted " | "-separated list
            ann_description = vcf_reader.infos[annotation_field].desc
            pattern = r"'(.+?)'"
            match = re.search(pattern, ann_description)
            if match:
                ann_header_match = match.group(1).split(" | ")
                ann_header = []
                ann_header_desc = {}
                for i in range(len(ann_header_match)):
                    # Map cleaned (alphanumeric-only) name -> original name
                    ann_header_info = "".join(
                        char for char in ann_header_match[i] if char.isalnum()
                    )
                    ann_header.append(ann_header_info)
                    ann_header_desc[ann_header_info] = ann_header_match[i]
                if not ann_header_desc:
                    raise ValueError("Invalid header description format")
            else:
                raise ValueError("Invalid header description format")

            # Create variant id column (also scheduled for removal)
            variant_id_column = self.get_variant_id_column()
            added_columns += [variant_id_column]

            # Load variants and their annotation column into a DataFrame
            dataframe_annotation_format = self.get_query_to_df(
                f""" SELECT "#CHROM", POS, REF, ALT, INFO, "{variant_id_column}", "{annotation_infos}" FROM {table_variants} """
            )

            # Convert each annotation value to a JSON object per transcript
            dataframe_annotation_format[
                annotation_format_infos
            ] = dataframe_annotation_format[annotation_infos].apply(
                lambda x: explode_annotation_format(
                    annotation=str(x),
                    uniquify=uniquify,
                    output_format="JSON",
                    prefix="",
                    header=list(ann_header_desc.values()),
                )
            )

            # Find the JSON keys actually present in the data (DuckDB reads
            # the local DataFrame by its Python variable name)
            query_json = f"""SELECT distinct(unnest(json_keys({annotation_format}, '$.0'))) AS 'key' FROM dataframe_annotation_format;"""
            df_keys = self.get_query_to_df(query=query_json)

            # Build one typed extraction clause per JSON key
            query_json_key = []
            for _, row in df_keys.iterrows():

                # Key
                key = row.iloc[0]

                # Cleaned key used as column name
                key_clean = "".join(char for char in key if char.isalnum())

                # Query extracting the key's values, used for type detection
                query_json_type = f"""SELECT unnest(json_extract_string({annotation_format}, '$.*."{key}"')) AS '{key_clean}' FROM dataframe_annotation_format WHERE trim('{key}') NOT IN ('');"""

                # Get DataFrame from query
                df_json_type = self.get_query_to_df(query=query_json_type)

                # Fill missing values with empty strings and then replace empty strings or None with NaN and drop rows with NaN
                with pd.option_context("future.no_silent_downcasting", True):
                    df_json_type.fillna(value="", inplace=True)
                    replace_dict = {None: np.nan, "": np.nan}
                    df_json_type.replace(replace_dict, inplace=True)
                    df_json_type.dropna(inplace=True)

                # Detect column type from the remaining non-empty values
                column_type = detect_column_type(df_json_type[key_clean])

                # Extraction clause: empty strings become NULL, cast to type
                query_json_key.append(
                    f"""NULLIF(unnest(json_extract_string({annotation_format}, '$.*."{key}"')), '')::{column_type}  AS '{prefix}{key_clean}' """
                )

            # Create the temporary table, adding the transcript column
            # NOTE(review): 'transcript' references {annotation_id} as a bare
            # column name while sub-field columns are created as
            # '{prefix}{key_clean}' — this appears to assume prefix is ""
            # here; confirm against get_explode_infos_prefix usage.
            query_view = f"""
                CREATE TEMPORARY TABLE {view_name}
                AS (
                    SELECT *, {annotation_id} AS 'transcript'
                    FROM (
                        SELECT "#CHROM", POS, REF, ALT, INFO, {",".join(query_json_key)}
                        FROM dataframe_annotation_format
                        )
                    );
            """
            self.execute_query(query=query_view)

        else:

            # Annotation field absent from the header: nothing to explode
            view_name = None

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)

        return view_name
10145
10146    def transcript_view_to_variants(
10147        self,
10148        transcripts_table: str = None,
10149        transcripts_column_id: str = None,
10150        transcripts_info_json: str = None,
10151        transcripts_info_field_json: str = None,
10152        transcripts_info_format: str = None,
10153        transcripts_info_field_format: str = None,
10154        param: dict = {},
10155    ) -> bool:
10156        """
10157        The `transcript_view_to_variants` function updates a variants table with information from
10158        transcripts in JSON format.
10159
10160        :param transcripts_table: The `transcripts_table` parameter is used to specify the name of the
10161        table containing the transcripts data. If this parameter is not provided, the function will
10162        attempt to retrieve it from the `param` dictionary or use a default value of "transcripts"
10163        :type transcripts_table: str
10164        :param transcripts_column_id: The `transcripts_column_id` parameter is used to specify the
10165        column in the `transcripts_table` that contains the unique identifier for each transcript. This
10166        identifier is used to match transcripts with variants in the database
10167        :type transcripts_column_id: str
10168        :param transcripts_info_json: The `transcripts_info_json` parameter is used to specify the name
10169        of the column in the variants table where the transcripts information will be stored in JSON
10170        format. This parameter allows you to define the column in the variants table that will hold the
10171        JSON-formatted information about transcripts
10172        :type transcripts_info_json: str
10173        :param transcripts_info_field_json: The `transcripts_info_field_json` parameter is used to
10174        specify the field in the VCF header that will contain information about transcripts in JSON
10175        format. This field will be added to the VCF header as an INFO field with the specified name
10176        :type transcripts_info_field_json: str
10177        :param transcripts_info_format: The `transcripts_info_format` parameter is used to specify the
10178        format of the information about transcripts that will be stored in the variants table. This
10179        format can be used to define how the transcript information will be structured or displayed
10180        within the variants table
10181        :type transcripts_info_format: str
10182        :param transcripts_info_field_format: The `transcripts_info_field_format` parameter is used to
10183        specify the field in the VCF header that will contain information about transcripts in a
10184        specific format. This field will be added to the VCF header as an INFO field with the specified
10185        name
10186        :type transcripts_info_field_format: str
10187        :param param: The `param` parameter in the `transcript_view_to_variants` method is a dictionary
10188        that contains various configuration settings related to transcripts. It is used to provide
10189        default values for certain parameters if they are not explicitly provided when calling the
10190        method. The `param` dictionary can be passed as an argument
10191        :type param: dict
10192        :return: The function `transcript_view_to_variants` returns a boolean value. It returns `True`
10193        if the operation is successful and `False` if certain conditions are not met.
10194        """
10195
10196        msg_info_prefix = "Start transcripts view to variants annotations"
10197
10198        log.debug(f"{msg_info_prefix}...")
10199
10200        # Default
10201        transcripts_table_default = "transcripts"
10202        transcripts_column_id_default = "transcript"
10203        transcripts_info_json_default = None
10204        transcripts_info_format_default = None
10205        transcripts_info_field_json_default = None
10206        transcripts_info_field_format_default = None
10207
10208        # Param
10209        if not param:
10210            param = self.get_param()
10211
10212        # Transcripts table
10213        if transcripts_table is None:
10214            transcripts_table = param.get("transcripts", {}).get(
10215                "table", transcripts_table_default
10216            )
10217
10218        # Transcripts column ID
10219        if transcripts_column_id is None:
10220            transcripts_column_id = param.get("transcripts", {}).get(
10221                "column_id", transcripts_column_id_default
10222            )
10223
10224        # Transcripts info json
10225        if transcripts_info_json is None:
10226            transcripts_info_json = param.get("transcripts", {}).get(
10227                "transcripts_info_json", transcripts_info_json_default
10228            )
10229
10230        # Transcripts info field JSON
10231        if transcripts_info_field_json is None:
10232            transcripts_info_field_json = param.get("transcripts", {}).get(
10233                "transcripts_info_field_json", transcripts_info_field_json_default
10234            )
10235        # if transcripts_info_field_json is not None and transcripts_info_json is None:
10236        #     transcripts_info_json = transcripts_info_field_json
10237
10238        # Transcripts info format
10239        if transcripts_info_format is None:
10240            transcripts_info_format = param.get("transcripts", {}).get(
10241                "transcripts_info_format", transcripts_info_format_default
10242            )
10243
10244        # Transcripts info field FORMAT
10245        if transcripts_info_field_format is None:
10246            transcripts_info_field_format = param.get("transcripts", {}).get(
10247                "transcripts_info_field_format", transcripts_info_field_format_default
10248            )
10249        # if (
10250        #     transcripts_info_field_format is not None
10251        #     and transcripts_info_format is None
10252        # ):
10253        #     transcripts_info_format = transcripts_info_field_format
10254
10255        # Variants table
10256        table_variants = self.get_table_variants()
10257
10258        # Check info columns param
10259        if (
10260            transcripts_info_json is None
10261            and transcripts_info_field_json is None
10262            and transcripts_info_format is None
10263            and transcripts_info_field_format is None
10264        ):
10265            return False
10266
10267        # Transcripts infos columns
10268        query_transcripts_infos_columns = f"""
10269            SELECT *
10270            FROM (
10271                DESCRIBE SELECT * FROM {transcripts_table}
10272                )
10273            WHERE "column_name" NOT IN ('#CHROM', 'POS', 'REF', 'ALT', '{transcripts_column_id}')
10274        """
10275        transcripts_infos_columns = list(
10276            self.get_query_to_df(query=query_transcripts_infos_columns)["column_name"]
10277        )
10278
10279        # View results
10280        clause_select = []
10281        clause_to_json = []
10282        clause_to_format = []
10283        for field in transcripts_infos_columns:
10284            clause_select.append(
10285                f""" regexp_split_to_table("{field}", ',') AS '{field}' """
10286            )
10287            clause_to_json.append(f""" '{field}': "{field}" """)
10288            clause_to_format.append(f""" "{field}" """)
10289
10290        # Update
10291        update_set_json = []
10292        update_set_format = []
10293
10294        # VCF header
10295        vcf_reader = self.get_header()
10296
10297        # Transcripts to info column in JSON
10298        if transcripts_info_json is not None:
10299
10300            # Create column on variants table
10301            self.add_column(
10302                table_name=table_variants,
10303                column_name=transcripts_info_json,
10304                column_type="JSON",
10305                default_value=None,
10306                drop=False,
10307            )
10308
10309            # Add header
10310            vcf_reader.infos[transcripts_info_json] = vcf.parser._Info(
10311                transcripts_info_json,
10312                ".",
10313                "String",
10314                "Transcripts in JSON format",
10315                "unknwon",
10316                "unknwon",
10317                self.code_type_map["String"],
10318            )
10319
10320            # Add to update
10321            update_set_json.append(
10322                f""" {transcripts_info_json}=t.{transcripts_info_json} """
10323            )
10324
10325        # Transcripts to info field in JSON
10326        if transcripts_info_field_json is not None:
10327
10328            log.debug(f"{msg_info_prefix} - Annotation in JSON format...")
10329
10330            # Add to update
10331            update_set_json.append(
10332                f""" 
10333                    INFO = concat(
10334                            CASE
10335                                WHEN INFO NOT IN ('', '.')
10336                                THEN INFO
10337                                ELSE ''
10338                            END,
10339                            CASE
10340                                WHEN CAST(t.{transcripts_info_json} AS VARCHAR) NOT IN ('', '.')
10341                                THEN concat(
10342                                    ';{transcripts_info_field_json}=',
10343                                    t.{transcripts_info_json}
10344                                )
10345                                ELSE ''
10346                            END
10347                            )
10348                """
10349            )
10350
10351            # Add header
10352            vcf_reader.infos[transcripts_info_field_json] = vcf.parser._Info(
10353                transcripts_info_field_json,
10354                ".",
10355                "String",
10356                "Transcripts in JSON format",
10357                "unknwon",
10358                "unknwon",
10359                self.code_type_map["String"],
10360            )
10361
10362        if update_set_json:
10363
10364            # Update query
10365            query_update = f"""
10366                UPDATE {table_variants}
10367                    SET {", ".join(update_set_json)}
10368                FROM
10369                (
10370                    SELECT
10371                        "#CHROM", POS, REF, ALT,
10372                            concat(
10373                            '{{',
10374                            string_agg(
10375                                '"' || "{transcripts_column_id}" || '":' ||
10376                                to_json(json_output)
10377                            ),
10378                            '}}'
10379                            )::JSON AS {transcripts_info_json}
10380                    FROM
10381                        (
10382                        SELECT
10383                            "#CHROM", POS, REF, ALT,
10384                            "{transcripts_column_id}",
10385                            to_json(
10386                                {{{",".join(clause_to_json)}}}
10387                            )::JSON AS json_output
10388                        FROM
10389                            (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table})
10390                        WHERE "{transcripts_column_id}" IS NOT NULL
10391                        )
10392                    GROUP BY "#CHROM", POS, REF, ALT
10393                ) AS t
10394                WHERE {table_variants}."#CHROM" = t."#CHROM"
10395                    AND {table_variants}."POS" = t."POS"
10396                    AND {table_variants}."REF" = t."REF"
10397                    AND {table_variants}."ALT" = t."ALT"
10398            """
10399
10400            self.execute_query(query=query_update)
10401
10402        # Transcripts to info column in FORMAT
10403        if transcripts_info_format is not None:
10404
10405            # Create column on variants table
10406            self.add_column(
10407                table_name=table_variants,
10408                column_name=transcripts_info_format,
10409                column_type="VARCHAR",
10410                default_value=None,
10411                drop=False,
10412            )
10413
10414            # Add header
10415            vcf_reader.infos[transcripts_info_format] = vcf.parser._Info(
10416                transcripts_info_format,
10417                ".",
10418                "String",
10419                f"Transcripts annotations: 'transcript | {' | '.join(transcripts_infos_columns)}'",
10420                "unknwon",
10421                "unknwon",
10422                self.code_type_map["String"],
10423            )
10424
10425            # Add to update
10426            update_set_format.append(
10427                f""" {transcripts_info_format}=t.{transcripts_info_format} """
10428            )
10429
10430        # Transcripts to info field in JSON
10431        if transcripts_info_field_format is not None:
10432
10433            log.debug(f"{msg_info_prefix} - Annotation in structured format...")
10434
10435            # Add to update
10436            update_set_format.append(
10437                f""" 
10438                    INFO = concat(
10439                            CASE
10440                                WHEN INFO NOT IN ('', '.')
10441                                THEN INFO
10442                                ELSE ''
10443                            END,
10444                            CASE
10445                                WHEN CAST(t.{transcripts_info_format} AS VARCHAR) NOT IN ('', '.')
10446                                THEN concat(
10447                                    ';{transcripts_info_field_format}=',
10448                                    t.{transcripts_info_format}
10449                                )
10450                                ELSE ''
10451                            END
10452                            )
10453                """
10454            )
10455
10456            # Add header
10457            vcf_reader.infos[transcripts_info_field_format] = vcf.parser._Info(
10458                transcripts_info_field_format,
10459                ".",
10460                "String",
10461                f"Transcripts annotations: 'transcript | {' | '.join(transcripts_infos_columns)}'",
10462                "unknwon",
10463                "unknwon",
10464                self.code_type_map["String"],
10465            )
10466
10467        if update_set_format:
10468
10469            # Update query
10470            query_update = f"""
10471                UPDATE {table_variants}
10472                    SET {", ".join(update_set_format)}
10473                FROM
10474                (
10475                    SELECT
10476                        "#CHROM", POS, REF, ALT,
10477                            string_agg({transcripts_info_format}) AS {transcripts_info_format}
10478                    FROM 
10479                        (
10480                        SELECT
10481                            "#CHROM", POS, REF, ALT,
10482                            "{transcripts_column_id}",
10483                            concat(
10484                                "{transcripts_column_id}",
10485                                '|',
10486                                {", '|', ".join(clause_to_format)}
10487                            ) AS {transcripts_info_format}
10488                        FROM
10489                            (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table})
10490                        )
10491                    GROUP BY "#CHROM", POS, REF, ALT
10492                ) AS t
10493                WHERE {table_variants}."#CHROM" = t."#CHROM"
10494                    AND {table_variants}."POS" = t."POS"
10495                    AND {table_variants}."REF" = t."REF"
10496                    AND {table_variants}."ALT" = t."ALT"
10497            """
10498
10499            self.execute_query(query=query_update)
10500
10501        return True
class Variants:
   34class Variants:
   35
   36    def __init__(
   37        self,
   38        conn=None,
   39        input: str = None,
   40        output: str = None,
   41        config: dict = {},
   42        param: dict = {},
   43        load: bool = False,
   44    ) -> None:
   45        """
   46        The function `__init__` initializes the variables, sets the input, output, config, param, connexion and
   47        header
   48
   49        :param conn: the connection to the database
   50        :param input: the input file
   51        :param output: the output file
   52        :param config: a dictionary containing the configuration of the model
   53        :param param: a dictionary containing the parameters of the model
   54        """
   55
   56        # Init variables
   57        self.init_variables()
   58
   59        # Input
   60        self.set_input(input)
   61
   62        # Config
   63        self.set_config(config)
   64
   65        # Param
   66        self.set_param(param)
   67
   68        # Output
   69        self.set_output(output)
   70
   71        # connexion
   72        self.set_connexion(conn)
   73
   74        # Header
   75        self.set_header()
   76
   77        # Samples
   78        self.set_samples()
   79
   80        # Load data
   81        if load:
   82            self.load_data()
   83
   84    def set_samples(self, samples: list = None) -> list:
   85        """
   86        The function `set_samples` sets the samples attribute of an object to a provided list or
   87        retrieves it from a parameter dictionary.
   88
   89        :param samples: The `set_samples` method is a method of a class that takes a list of samples as
   90        input and sets the `samples` attribute of the class to the provided list. If no samples are
   91        provided, it tries to get the samples from the class's parameters using the `get_param` method
   92        :type samples: list
   93        :return: The `samples` list is being returned.
   94        """
   95
   96        if not samples:
   97            samples = self.get_param().get("samples", {}).get("list", None)
   98
   99        self.samples = samples
  100
  101        return samples
  102
  103    def get_samples(self) -> list:
  104        """
  105        This function returns a list of samples.
  106        :return: The `get_samples` method is returning the `samples` attribute of the object.
  107        """
  108
  109        return self.samples
  110
  111    def get_samples_check(self) -> bool:
  112        """
  113        This function returns the value of the "check" key within the "samples" dictionary retrieved
  114        from the parameters.
  115        :return: The method `get_samples_check` is returning the value of the key "check" inside the
  116        "samples" dictionary, which is nested inside the dictionary returned by the `get_param()`
  117        method. If the key "check" is not found, it will return `False`.
  118        """
  119
  120        return self.get_param().get("samples", {}).get("check", True)
  121
  122    def set_input(self, input: str = None) -> None:
  123        """
  124        The function `set_input` takes a file name as input, extracts the name and extension, and sets
  125        attributes in the class accordingly.
  126
  127        :param input: The `set_input` method in the provided code snippet is used to set attributes
  128        related to the input file. Here's a breakdown of the parameters and their usage in the method:
  129        :type input: str
  130        """
  131
  132        if input and not isinstance(input, str):
  133            try:
  134                self.input = input.name
  135            except:
  136                log.error(f"Input file '{input} in bad format")
  137                raise ValueError(f"Input file '{input} in bad format")
  138        else:
  139            self.input = input
  140
  141        # Input format
  142        if input:
  143            input_name, input_extension = os.path.splitext(self.input)
  144            self.input_name = input_name
  145            self.input_extension = input_extension
  146            self.input_format = self.input_extension.replace(".", "")
  147
  148    def set_config(self, config: dict) -> None:
  149        """
  150        The set_config function takes a config object and assigns it as the configuration object for the
  151        class.
  152
  153        :param config: The `config` parameter in the `set_config` function is a dictionary object that
  154        contains configuration settings for the class. When you call the `set_config` function with a
  155        dictionary object as the argument, it will set that dictionary as the configuration object for
  156        the class
  157        :type config: dict
  158        """
  159
  160        self.config = config
  161
  162    def set_param(self, param: dict) -> None:
  163        """
  164        This function sets a parameter object for the class based on the input dictionary.
  165
  166        :param param: The `set_param` method you provided takes a dictionary object as input and sets it
  167        as the `param` attribute of the class instance
  168        :type param: dict
  169        """
  170
  171        self.param = param
  172
  173    def init_variables(self) -> None:
  174        """
  175        This function initializes the variables that will be used in the rest of the class
  176        """
  177
  178        self.prefix = "howard"
  179        self.table_variants = "variants"
  180        self.dataframe = None
  181
  182        self.comparison_map = {
  183            "gt": ">",
  184            "gte": ">=",
  185            "lt": "<",
  186            "lte": "<=",
  187            "equals": "=",
  188            "contains": "SIMILAR TO",
  189        }
  190
  191        self.code_type_map = {"Integer": 0, "String": 1, "Float": 2, "Flag": 3}
  192
  193        self.code_type_map_to_sql = {
  194            "Integer": "INTEGER",
  195            "String": "VARCHAR",
  196            "Float": "FLOAT",
  197            "Flag": "VARCHAR",
  198        }
  199
  200        self.index_additionnal_fields = []
  201
  202    def get_indexing(self) -> bool:
  203        """
  204        It returns the value of the key "indexing" in the dictionary. If the key is not present, it
  205        returns False.
  206        :return: The value of the indexing parameter.
  207        """
  208
  209        return self.get_param().get("indexing", False)
  210
  211    def get_connexion_config(self) -> dict:
  212        """
  213        The function `get_connexion_config` returns a dictionary containing the configuration for a
  214        connection, including the number of threads and memory limit.
  215        :return: a dictionary containing the configuration for the Connexion library.
  216        """
  217
  218        # config
  219        config = self.get_config()
  220
  221        # Connexion config
  222        connexion_config = {}
  223        threads = self.get_threads()
  224
  225        # Threads
  226        if threads:
  227            connexion_config["threads"] = threads
  228
  229        # Memory
  230        # if config.get("memory", None):
  231        #     connexion_config["memory_limit"] = config.get("memory")
  232        if self.get_memory():
  233            connexion_config["memory_limit"] = self.get_memory()
  234
  235        # Temporary directory
  236        if config.get("tmp", None):
  237            connexion_config["temp_directory"] = config.get("tmp")
  238
  239        # Access
  240        if config.get("access", None):
  241            access = config.get("access")
  242            if access in ["RO"]:
  243                access = "READ_ONLY"
  244            elif access in ["RW"]:
  245                access = "READ_WRITE"
  246            connexion_db = self.get_connexion_db()
  247            if connexion_db in ":memory:":
  248                access = "READ_WRITE"
  249            connexion_config["access_mode"] = access
  250
  251        return connexion_config
  252
  253    def get_duckdb_settings(self) -> dict:
  254        """
  255        The function `get_duckdb_settings` retrieves DuckDB settings from a configuration file or a
  256        string.
  257        :return: The function `get_duckdb_settings` returns a dictionary object `duckdb_settings_dict`.
  258        """
  259
  260        # config
  261        config = self.get_config()
  262
  263        # duckdb settings
  264        duckdb_settings_dict = {}
  265        if config.get("duckdb_settings", None):
  266            duckdb_settings = config.get("duckdb_settings")
  267            duckdb_settings = full_path(duckdb_settings)
  268            # duckdb setting is a file
  269            if os.path.exists(duckdb_settings):
  270                with open(duckdb_settings) as json_file:
  271                    duckdb_settings_dict = yaml.safe_load(json_file)
  272            # duckdb settings is a string
  273            else:
  274                duckdb_settings_dict = json.loads(duckdb_settings)
  275
  276        return duckdb_settings_dict
  277
  278    def set_connexion_db(self) -> str:
  279        """
  280        The function `set_connexion_db` returns the appropriate database connection string based on the
  281        input format and connection type.
  282        :return: the value of the variable `connexion_db`.
  283        """
  284
  285        # Default connexion db
  286        default_connexion_db = ":memory:"
  287
  288        # Find connexion db
  289        if self.get_input_format() in ["db", "duckdb"]:
  290            connexion_db = self.get_input()
  291        elif self.get_connexion_type() in ["memory", default_connexion_db, None]:
  292            connexion_db = default_connexion_db
  293        elif self.get_connexion_type() in ["tmpfile"]:
  294            tmp_name = tempfile.mkdtemp(
  295                prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".db"
  296            )
  297            connexion_db = f"{tmp_name}/tmp.db"
  298        elif self.get_connexion_type() != "":
  299            connexion_db = self.get_connexion_type()
  300        else:
  301            connexion_db = default_connexion_db
  302
  303        # Set connexion db
  304        self.connexion_db = connexion_db
  305
  306        return connexion_db
  307
    def set_connexion(self, conn) -> None:
        """
        Create (or adopt) the database connexion and store it on the object.

        When no connexion is given, a new one is opened according to the
        configured ``connexion_format``: a duckdb connexion (with optional
        settings applied as PRAGMA statements) or a sqlite connexion.

        :param conn: an existing database connection to reuse; when falsy a
            new connection is created
        """

        # Database to connect to (file path or ":memory:")
        connexion_db = self.set_connexion_db()

        # Connexion config (threads, memory limit, temp dir, access mode)
        connexion_config = self.get_connexion_config()

        # Connexion format (defaults to duckdb)
        connexion_format = self.get_config().get("connexion_format", "duckdb")
        # Set connexion format
        self.connexion_format = connexion_format

        # Create a connexion only when none was provided by the caller
        if not conn:
            if connexion_format in ["duckdb"]:
                conn = duckdb.connect(connexion_db, config=connexion_config)
                # Apply configured duckDB settings as PRAGMA statements
                duckdb_settings = self.get_duckdb_settings()
                if duckdb_settings:
                    for setting in duckdb_settings:
                        setting_value = duckdb_settings.get(setting)
                        # String values must be quoted in the PRAGMA statement
                        if isinstance(setting_value, str):
                            setting_value = f"'{setting_value}'"
                        conn.execute(f"PRAGMA {setting}={setting_value};")
            elif connexion_format in ["sqlite"]:
                conn = sqlite3.connect(connexion_db)

        # Set connexion
        self.conn = conn

        # Log
        log.debug(f"connexion_format: {connexion_format}")
        log.debug(f"connexion_db: {connexion_db}")
        log.debug(f"connexion config: {connexion_config}")
        log.debug(f"connexion duckdb settings: {self.get_duckdb_settings()}")
  353
  354    def set_output(self, output: str = None) -> None:
  355        """
  356        The `set_output` function in Python sets the output file based on the input or a specified key
  357        in the config file, extracting the output name, extension, and format.
  358
  359        :param output: The `output` parameter in the `set_output` method is used to specify the name of
  360        the output file. If the config file has an 'output' key, the method sets the output to the value
  361        of that key. If no output is provided, it sets the output to `None`
  362        :type output: str
  363        """
  364
  365        if output and not isinstance(output, str):
  366            self.output = output.name
  367        else:
  368            self.output = output
  369
  370        # Output format
  371        if self.output:
  372            output_name, output_extension = os.path.splitext(self.output)
  373            self.output_name = output_name
  374            self.output_extension = output_extension
  375            self.output_format = self.output_extension.replace(".", "")
  376        else:
  377            self.output_name = None
  378            self.output_extension = None
  379            self.output_format = None
  380
  381    def set_header(self) -> None:
  382        """
  383        It reads the header of a VCF file and stores it as a list of strings and as a VCF object
  384        """
  385
  386        input_file = self.get_input()
  387        default_header_list = [
  388            "##fileformat=VCFv4.2",
  389            "#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO",
  390        ]
  391
  392        # Full path
  393        input_file = full_path(input_file)
  394
  395        if input_file:
  396
  397            input_format = self.get_input_format()
  398            input_compressed = self.get_input_compressed()
  399            config = self.get_config()
  400            header_list = default_header_list
  401            if input_format in [
  402                "vcf",
  403                "hdr",
  404                "tsv",
  405                "csv",
  406                "psv",
  407                "parquet",
  408                "db",
  409                "duckdb",
  410            ]:
  411                # header provided in param
  412                if config.get("header_file", None):
  413                    with open(config.get("header_file"), "rt") as f:
  414                        header_list = self.read_vcf_header(f)
  415                # within a vcf file format (header within input file itsself)
  416                elif input_format in ["vcf", "hdr"] and not os.path.isdir(input_file):
  417                    # within a compressed vcf file format (.vcf.gz)
  418                    if input_compressed:
  419                        with bgzf.open(input_file, "rt") as f:
  420                            header_list = self.read_vcf_header(f)
  421                    # within an uncompressed vcf file format (.vcf)
  422                    else:
  423                        with open(input_file, "rt") as f:
  424                            header_list = self.read_vcf_header(f)
  425                # header provided in default external file .hdr
  426                elif os.path.exists((input_file + ".hdr")):
  427                    with open(input_file + ".hdr", "rt") as f:
  428                        header_list = self.read_vcf_header(f)
  429                else:
  430                    try:  # Try to get header info fields and file columns
  431
  432                        with tempfile.TemporaryDirectory() as tmpdir:
  433
  434                            # Create database
  435                            db_for_header = Database(database=input_file)
  436
  437                            # Get header columns for infos fields
  438                            db_header_from_columns = (
  439                                db_for_header.get_header_from_columns()
  440                            )
  441
  442                            # Get real columns in the file
  443                            db_header_columns = db_for_header.get_columns()
  444
  445                            # Write header file
  446                            header_file_tmp = os.path.join(tmpdir, "header")
  447                            f = open(header_file_tmp, "w")
  448                            vcf.Writer(f, db_header_from_columns)
  449                            f.close()
  450
  451                            # Replace #CHROM line with rel columns
  452                            header_list = db_for_header.read_header_file(
  453                                header_file=header_file_tmp
  454                            )
  455                            header_list[-1] = "\t".join(db_header_columns)
  456
  457                    except:
  458
  459                        log.warning(
  460                            f"No header for file {input_file}. Set as default VCF header"
  461                        )
  462                        header_list = default_header_list
  463
  464            else:  # try for unknown format ?
  465
  466                log.error(f"Input file format '{input_format}' not available")
  467                raise ValueError(f"Input file format '{input_format}' not available")
  468
  469            if not header_list:
  470                header_list = default_header_list
  471
  472            # header as list
  473            self.header_list = header_list
  474
  475            # header as VCF object
  476            self.header_vcf = vcf.Reader(io.StringIO("\n".join(header_list)))
  477
  478        else:
  479
  480            self.header_list = None
  481            self.header_vcf = None
  482
  483    def get_query_to_df(self, query: str = "", limit: int = None) -> pd.DataFrame:
  484        """
  485        The `get_query_to_df` function takes a query as a string and returns the result as a pandas
  486        DataFrame based on the connection format.
  487
  488        :param query: The `query` parameter in the `get_query_to_df` function is a string that
  489        represents the SQL query you want to execute. This query will be used to fetch data from a
  490        database and convert it into a pandas DataFrame
  491        :type query: str
  492        :param limit: The `limit` parameter in the `get_query_to_df` function is used to specify the
  493        maximum number of rows to be returned in the resulting dataframe. If a limit is provided, the
  494        function will only fetch up to that number of rows from the database query result. If no limit
  495        is specified,
  496        :type limit: int
  497        :return: A pandas DataFrame is being returned by the `get_query_to_df` function.
  498        """
  499
  500        # Connexion format
  501        connexion_format = self.get_connexion_format()
  502
  503        # Limit in query
  504        if limit:
  505            pd.set_option("display.max_rows", limit)
  506            if connexion_format in ["duckdb"]:
  507                df = (
  508                    self.conn.execute(query)
  509                    .fetch_record_batch(limit)
  510                    .read_next_batch()
  511                    .to_pandas()
  512                )
  513            elif connexion_format in ["sqlite"]:
  514                df = next(pd.read_sql_query(query, self.conn, chunksize=limit))
  515
  516        # Full query
  517        else:
  518            if connexion_format in ["duckdb"]:
  519                df = self.conn.execute(query).df()
  520            elif connexion_format in ["sqlite"]:
  521                df = pd.read_sql_query(query, self.conn)
  522
  523        return df
  524
  525    def get_overview(self) -> None:
  526        """
  527        The function prints the input, output, config, and dataframe of the current object
  528        """
  529        table_variants_from = self.get_table_variants(clause="from")
  530        sql_columns = self.get_header_columns_as_sql()
  531        sql_query_export = f"SELECT {sql_columns} FROM {table_variants_from}"
  532        df = self.get_query_to_df(sql_query_export)
  533        log.info(
  534            "Input:  "
  535            + str(self.get_input())
  536            + " ["
  537            + str(str(self.get_input_format()))
  538            + "]"
  539        )
  540        log.info(
  541            "Output: "
  542            + str(self.get_output())
  543            + " ["
  544            + str(str(self.get_output_format()))
  545            + "]"
  546        )
  547        log.info("Config: ")
  548        for d in str(json.dumps(self.get_config(), indent=4, sort_keys=True)).split(
  549            "\n"
  550        ):
  551            log.info("\t" + str(d))
  552        log.info("Param: ")
  553        for d in str(json.dumps(self.get_param(), indent=4, sort_keys=True)).split(
  554            "\n"
  555        ):
  556            log.info("\t" + str(d))
  557        log.info("Sample list: " + str(self.get_header_sample_list()))
  558        log.info("Dataframe: ")
  559        for d in str(df).split("\n"):
  560            log.info("\t" + str(d))
  561
  562        # garbage collector
  563        del df
  564        gc.collect()
  565
  566        return None
  567
  568    def get_stats(self) -> dict:
  569        """
  570        The `get_stats` function calculates and returns various statistics of the current object,
  571        including information about the input file, variants, samples, header fields, quality, and
  572        SNVs/InDels.
  573        :return: a dictionary containing various statistics of the current object. The dictionary has
  574        the following structure:
  575        """
  576
  577        # Log
  578        log.info(f"Stats Calculation...")
  579
  580        # table varaints
  581        table_variants_from = self.get_table_variants()
  582
  583        # stats dict
  584        stats = {"Infos": {}}
  585
  586        ### File
  587        input_file = self.get_input()
  588        stats["Infos"]["Input file"] = input_file
  589
  590        # Header
  591        header_infos = self.get_header().infos
  592        header_formats = self.get_header().formats
  593        header_infos_list = list(header_infos)
  594        header_formats_list = list(header_formats)
  595
  596        ### Variants
  597
  598        stats["Variants"] = {}
  599
  600        # Variants by chr
  601        sql_query_nb_variant_by_chrom = f'SELECT "#CHROM" as CHROM, count(*) as count FROM {table_variants_from} GROUP BY "#CHROM"'
  602        df_nb_of_variants_by_chrom = self.get_query_to_df(sql_query_nb_variant_by_chrom)
  603        nb_of_variants_by_chrom = df_nb_of_variants_by_chrom.sort_values(
  604            by=["CHROM"], kind="quicksort"
  605        )
  606
  607        # Total number of variants
  608        nb_of_variants = nb_of_variants_by_chrom["count"].sum()
  609
  610        # Calculate percentage
  611        nb_of_variants_by_chrom["percent"] = nb_of_variants_by_chrom["count"].apply(
  612            lambda x: (x / nb_of_variants)
  613        )
  614
  615        stats["Variants"]["Number of variants by chromosome"] = (
  616            nb_of_variants_by_chrom.to_dict(orient="index")
  617        )
  618
  619        stats["Infos"]["Number of variants"] = int(nb_of_variants)
  620
  621        ### Samples
  622
  623        # Init
  624        samples = {}
  625        nb_of_samples = 0
  626
  627        # Check Samples
  628        if "GT" in header_formats_list and "FORMAT" in self.get_header_columns():
  629            log.debug(f"Check samples...")
  630            for sample in self.get_header_sample_list():
  631                sql_query_samples = f"""
  632                    SELECT  '{sample}' as sample,
  633                            REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1) as genotype,
  634                            count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1)) as count,
  635                            concat((count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1))/{nb_of_variants})) as percentage
  636                    FROM {table_variants_from}
  637                    WHERE (
  638                        regexp_matches("{sample}", '^[0-9]([/|][0-9])+')
  639                        AND
  640                        len(string_split(CAST("FORMAT" AS VARCHAR), ':')) = len(string_split(CAST("{sample}" AS VARCHAR), ':'))
  641                      )
  642                    GROUP BY genotype
  643                    """
  644                sql_query_genotype_df = self.conn.execute(sql_query_samples).df()
  645                sample_genotype_count = sql_query_genotype_df["count"].sum()
  646                if len(sql_query_genotype_df):
  647                    nb_of_samples += 1
  648                    samples[f"{sample} - {sample_genotype_count} variants"] = (
  649                        sql_query_genotype_df.to_dict(orient="index")
  650                    )
  651
  652            stats["Samples"] = samples
  653            stats["Infos"]["Number of samples"] = nb_of_samples
  654
  655        # #
  656        # if "FORMAT" in self.get_header_columns() and "DP" in header_formats_list:
  657        #     stats["Infos"]["Number of samples"] = nb_of_samples
  658        # elif nb_of_samples:
  659        #     stats["Infos"]["Number of samples"] = "not a VCF format"
  660
  661        ### INFO and FORMAT fields
  662        header_types_df = {}
  663        header_types_list = {
  664            "List of INFO fields": header_infos,
  665            "List of FORMAT fields": header_formats,
  666        }
  667        i = 0
  668        for header_type in header_types_list:
  669
  670            header_type_infos = header_types_list.get(header_type)
  671            header_infos_dict = {}
  672
  673            for info in header_type_infos:
  674
  675                i += 1
  676                header_infos_dict[i] = {}
  677
  678                # ID
  679                header_infos_dict[i]["id"] = info
  680
  681                # num
  682                genotype_map = {None: ".", -1: "A", -2: "G", -3: "R"}
  683                if header_type_infos[info].num in genotype_map.keys():
  684                    header_infos_dict[i]["Number"] = genotype_map.get(
  685                        header_type_infos[info].num
  686                    )
  687                else:
  688                    header_infos_dict[i]["Number"] = header_type_infos[info].num
  689
  690                # type
  691                if header_type_infos[info].type:
  692                    header_infos_dict[i]["Type"] = header_type_infos[info].type
  693                else:
  694                    header_infos_dict[i]["Type"] = "."
  695
  696                # desc
  697                if header_type_infos[info].desc != None:
  698                    header_infos_dict[i]["Description"] = header_type_infos[info].desc
  699                else:
  700                    header_infos_dict[i]["Description"] = ""
  701
  702            if len(header_infos_dict):
  703                header_types_df[header_type] = pd.DataFrame.from_dict(
  704                    header_infos_dict, orient="index"
  705                ).to_dict(orient="index")
  706
  707        # Stats
  708        stats["Infos"]["Number of INFO fields"] = len(header_infos_list)
  709        stats["Infos"]["Number of FORMAT fields"] = len(header_formats_list)
  710        stats["Header"] = header_types_df
  711
  712        ### QUAL
  713        if "QUAL" in self.get_header_columns():
  714            sql_query_qual = f"""
  715                    SELECT
  716                        avg(CAST(QUAL AS INTEGER)) AS Average,
  717                        min(CAST(QUAL AS INTEGER)) AS Minimum,
  718                        max(CAST(QUAL AS INTEGER)) AS Maximum,
  719                        stddev(CAST(QUAL AS INTEGER)) AS StandardDeviation,
  720                        median(CAST(QUAL AS INTEGER)) AS Median,
  721                        variance(CAST(QUAL AS INTEGER)) AS Variance
  722                    FROM {table_variants_from}
  723                    WHERE CAST(QUAL AS VARCHAR) NOT IN ('.')
  724                    """
  725
  726            qual = self.conn.execute(sql_query_qual).df().to_dict(orient="index")
  727            stats["Quality"] = {"Stats": qual}
  728
  729        ### SNV and InDel
  730
  731        sql_query_snv = f"""
  732            
  733            SELECT Type, count FROM (
  734
  735                    SELECT
  736                        'Total' AS Type,
  737                        count(*) AS count
  738                    FROM {table_variants_from}
  739
  740                    UNION
  741
  742                    SELECT
  743                        'MNV' AS Type,
  744                        count(*) AS count
  745                    FROM {table_variants_from}
  746                    WHERE len(REF) > 1 AND len(ALT) > 1
  747                    AND len(REF) = len(ALT)
  748
  749                    UNION
  750
  751                    SELECT
  752                        'InDel' AS Type,
  753                        count(*) AS count
  754                    FROM {table_variants_from}
  755                    WHERE len(REF) > 1 OR len(ALT) > 1
  756                    AND len(REF) != len(ALT)
  757                    
  758                    UNION
  759
  760                    SELECT
  761                        'SNV' AS Type,
  762                        count(*) AS count
  763                    FROM {table_variants_from}
  764                    WHERE len(REF) = 1 AND len(ALT) = 1
  765
  766                )
  767
  768            ORDER BY count DESC
  769
  770                """
  771        snv_indel = self.conn.execute(sql_query_snv).df().to_dict(orient="index")
  772
  773        sql_query_snv_substitution = f"""
  774                SELECT
  775                    concat(REF, '>', ALT) AS 'Substitution',
  776                    count(*) AS count
  777                FROM {table_variants_from}
  778                WHERE len(REF) = 1 AND len(ALT) = 1
  779                GROUP BY REF, ALT
  780                ORDER BY count(*) DESC
  781                """
  782        snv_substitution = (
  783            self.conn.execute(sql_query_snv_substitution).df().to_dict(orient="index")
  784        )
  785        stats["Variants"]["Counts"] = snv_indel
  786        stats["Variants"]["Substitutions"] = snv_substitution
  787
  788        return stats
  789
  790    def stats_to_file(self, file: str = None) -> str:
  791        """
  792        The function `stats_to_file` takes a file name as input, retrieves statistics, serializes them
  793        into a JSON object, and writes the JSON object to the specified file.
  794
  795        :param file: The `file` parameter is a string that represents the file path where the JSON data
  796        will be written
  797        :type file: str
  798        :return: the name of the file that was written to.
  799        """
  800
  801        # Get stats
  802        stats = self.get_stats()
  803
  804        # Serializing json
  805        json_object = json.dumps(stats, indent=4)
  806
  807        # Writing to sample.json
  808        with open(file, "w") as outfile:
  809            outfile.write(json_object)
  810
  811        return file
  812
  813    def print_stats(self, output_file: str = None, json_file: str = None) -> None:
  814        """
  815        The `print_stats` function generates a markdown file and prints the statistics contained in a
  816        JSON file in a formatted manner.
  817
  818        :param output_file: The `output_file` parameter is a string that specifies the path and filename
  819        of the output file where the stats will be printed in Markdown format. If no `output_file` is
  820        provided, a temporary directory will be created and the stats will be saved in a file named
  821        "stats.md" within that
  822        :type output_file: str
  823        :param json_file: The `json_file` parameter is a string that represents the path to the JSON
  824        file where the statistics will be saved. If no value is provided, a temporary directory will be
  825        created and a default file name "stats.json" will be used
  826        :type json_file: str
  827        :return: The function `print_stats` does not return any value. It has a return type annotation
  828        of `None`.
  829        """
  830
  831        # Full path
  832        output_file = full_path(output_file)
  833        json_file = full_path(json_file)
  834
  835        with tempfile.TemporaryDirectory() as tmpdir:
  836
  837            # Files
  838            if not output_file:
  839                output_file = os.path.join(tmpdir, "stats.md")
  840            if not json_file:
  841                json_file = os.path.join(tmpdir, "stats.json")
  842
  843            # Create folders
  844            if not os.path.exists(os.path.dirname(output_file)):
  845                Path(os.path.dirname(output_file)).mkdir(parents=True, exist_ok=True)
  846            if not os.path.exists(os.path.dirname(json_file)):
  847                Path(os.path.dirname(json_file)).mkdir(parents=True, exist_ok=True)
  848
  849            # Create stats JSON file
  850            stats_file = self.stats_to_file(file=json_file)
  851
  852            # Print stats file
  853            with open(stats_file) as f:
  854                stats = yaml.safe_load(f)
  855
  856            # Output
  857            output_title = []
  858            output_index = []
  859            output = []
  860
  861            # Title
  862            output_title.append("# HOWARD Stats")
  863
  864            # Index
  865            output_index.append("## Index")
  866
  867            # Process sections
  868            for section in stats:
  869                infos = stats.get(section)
  870                section_link = "#" + section.lower().replace(" ", "-")
  871                output.append(f"## {section}")
  872                output_index.append(f"- [{section}]({section_link})")
  873
  874                if len(infos):
  875                    for info in infos:
  876                        try:
  877                            df = pd.DataFrame.from_dict(infos.get(info), orient="index")
  878                            is_df = True
  879                        except:
  880                            try:
  881                                df = pd.DataFrame.from_dict(
  882                                    json.loads((infos.get(info))), orient="index"
  883                                )
  884                                is_df = True
  885                            except:
  886                                is_df = False
  887                        if is_df:
  888                            output.append(f"### {info}")
  889                            info_link = "#" + info.lower().replace(" ", "-")
  890                            output_index.append(f"   - [{info}]({info_link})")
  891                            output.append(f"{df.to_markdown(index=False)}")
  892                        else:
  893                            output.append(f"- {info}: {infos.get(info)}")
  894                else:
  895                    output.append(f"NA")
  896
  897            # Write stats in markdown file
  898            with open(output_file, "w") as fp:
  899                for item in output_title:
  900                    fp.write("%s\n" % item)
  901                for item in output_index:
  902                    fp.write("%s\n" % item)
  903                for item in output:
  904                    fp.write("%s\n" % item)
  905
  906            # Output stats in markdown
  907            print("")
  908            print("\n\n".join(output_title))
  909            print("")
  910            print("\n\n".join(output))
  911            print("")
  912
  913        return None
  914
  915    def get_input(self) -> str:
  916        """
  917        It returns the value of the input variable.
  918        :return: The input is being returned.
  919        """
  920        return self.input
  921
  922    def get_input_format(self, input_file: str = None) -> str:
  923        """
  924        This function returns the format of the input variable, either from the provided input file or
  925        by prompting for input.
  926
  927        :param input_file: The `input_file` parameter in the `get_input_format` method is a string that
  928        represents the file path of the input file. If no `input_file` is provided when calling the
  929        method, it will default to `None`
  930        :type input_file: str
  931        :return: The format of the input variable is being returned.
  932        """
  933
  934        if not input_file:
  935            input_file = self.get_input()
  936        input_format = get_file_format(input_file)
  937        return input_format
  938
  939    def get_input_compressed(self, input_file: str = None) -> str:
  940        """
  941        The function `get_input_compressed` returns the format of the input variable after compressing
  942        it.
  943
  944        :param input_file: The `input_file` parameter in the `get_input_compressed` method is a string
  945        that represents the file path of the input file. If no `input_file` is provided when calling the
  946        method, it will default to `None` and the method will then call `self.get_input()` to
  947        :type input_file: str
  948        :return: The function `get_input_compressed` returns the compressed format of the input
  949        variable.
  950        """
  951
  952        if not input_file:
  953            input_file = self.get_input()
  954        input_compressed = get_file_compressed(input_file)
  955        return input_compressed
  956
  957    def get_output(self) -> str:
  958        """
  959        It returns the output of the neuron.
  960        :return: The output of the neural network.
  961        """
  962
  963        return self.output
  964
  965    def get_output_format(self, output_file: str = None) -> str:
  966        """
  967        The function `get_output_format` returns the format of the input variable or the output file if
  968        provided.
  969
  970        :param output_file: The `output_file` parameter in the `get_output_format` method is a string
  971        that represents the file path of the output file. If no `output_file` is provided when calling
  972        the method, it will default to the output obtained from the `get_output` method of the class
  973        instance. The
  974        :type output_file: str
  975        :return: The format of the input variable is being returned.
  976        """
  977
  978        if not output_file:
  979            output_file = self.get_output()
  980        output_format = get_file_format(output_file)
  981
  982        return output_format
  983
  984    def get_config(self) -> dict:
  985        """
  986        It returns the config
  987        :return: The config variable is being returned.
  988        """
  989        return self.config
  990
  991    def get_param(self) -> dict:
  992        """
  993        It returns the param
  994        :return: The param variable is being returned.
  995        """
  996        return self.param
  997
  998    def get_connexion_db(self) -> str:
  999        """
 1000        It returns the connexion_db attribute of the object
 1001        :return: The connexion_db is being returned.
 1002        """
 1003        return self.connexion_db
 1004
 1005    def get_prefix(self) -> str:
 1006        """
 1007        It returns the prefix of the object.
 1008        :return: The prefix is being returned.
 1009        """
 1010        return self.prefix
 1011
 1012    def get_table_variants(self, clause: str = "select") -> str:
 1013        """
 1014        This function returns the table_variants attribute of the object
 1015
 1016        :param clause: the type of clause the table will be used. Either "select" or "from" (optional),
 1017        defaults to select (optional)
 1018        :return: The table_variants attribute of the object.
 1019        """
 1020
 1021        # Access
 1022        access = self.get_config().get("access", None)
 1023
 1024        # Clauses "select", "where", "update"
 1025        if clause in ["select", "where", "update"]:
 1026            table_variants = self.table_variants
 1027        # Clause "from"
 1028        elif clause in ["from"]:
 1029            # For Read Only
 1030            if self.get_input_format() in ["parquet"] and access in ["RO"]:
 1031                input_file = self.get_input()
 1032                table_variants = f"'{input_file}' as variants"
 1033            # For Read Write
 1034            else:
 1035                table_variants = f"{self.table_variants} as variants"
 1036        else:
 1037            table_variants = self.table_variants
 1038        return table_variants
 1039
 1040    def get_tmp_dir(self) -> str:
 1041        """
 1042        The function `get_tmp_dir` returns the temporary directory path based on configuration
 1043        parameters or a default path.
 1044        :return: The `get_tmp_dir` method is returning the temporary directory path based on the
 1045        configuration, parameters, and a default value of "/tmp".
 1046        """
 1047
 1048        return get_tmp(
 1049            config=self.get_config(), param=self.get_param(), default_tmp="/tmp"
 1050        )
 1051
 1052    def get_connexion_type(self) -> str:
 1053        """
 1054        If the connexion type is not in the list of allowed connexion types, raise a ValueError
 1055
 1056        :return: The connexion type is being returned.
 1057        """
 1058        return self.get_config().get("connexion_type", "memory")
 1059
 1060    def get_connexion(self):
 1061        """
 1062        It returns the connection object
 1063
 1064        :return: The connection object.
 1065        """
 1066        return self.conn
 1067
 1068    def close_connexion(self) -> None:
 1069        """
 1070        This function closes the connection to the database.
 1071        :return: The connection is being closed.
 1072        """
 1073        return self.conn.close()
 1074
 1075    def get_header(self, type: str = "vcf"):
 1076        """
 1077        This function returns the header of the VCF file as a list of strings
 1078
 1079        :param type: the type of header you want to get, defaults to vcf (optional)
 1080        :return: The header of the vcf file.
 1081        """
 1082
 1083        if self.header_vcf:
 1084            if type == "vcf":
 1085                return self.header_vcf
 1086            elif type == "list":
 1087                return self.header_list
 1088        else:
 1089            if type == "vcf":
 1090                header = vcf.Reader(io.StringIO("\n".join(vcf_required)))
 1091                return header
 1092            elif type == "list":
 1093                return vcf_required
 1094
 1095    def get_header_length(self, file: str = None) -> int:
 1096        """
 1097        The function `get_header_length` returns the length of the header list, excluding the #CHROM
 1098        line.
 1099
 1100        :param file: The `file` parameter is an optional argument that specifies the path to a VCF
 1101        header file. If this argument is provided, the function will read the header from the specified
 1102        file and return the length of the header list minus 1 (to exclude the #CHROM line)
 1103        :type file: str
 1104        :return: the length of the header list, excluding the #CHROM line.
 1105        """
 1106
 1107        if file:
 1108            return len(self.read_vcf_header_file(file=file)) - 1
 1109        elif self.get_header(type="list"):
 1110            return len(self.get_header(type="list")) - 1
 1111        else:
 1112            return 0
 1113
 1114    def get_header_columns(self) -> str:
 1115        """
 1116        This function returns the header list of a VCF
 1117
 1118        :return: The length of the header list.
 1119        """
 1120        if self.get_header():
 1121            return self.get_header(type="list")[-1]
 1122        else:
 1123            return ""
 1124
 1125    def get_header_columns_as_list(self) -> list:
 1126        """
 1127        This function returns the header list of a VCF
 1128
 1129        :return: The length of the header list.
 1130        """
 1131        if self.get_header():
 1132            return self.get_header_columns().strip().split("\t")
 1133        else:
 1134            return []
 1135
 1136    def get_header_columns_as_sql(self) -> str:
 1137        """
 1138        This function retruns header length (without #CHROM line)
 1139
 1140        :return: The length of the header list.
 1141        """
 1142        sql_column_list = []
 1143        for col in self.get_header_columns_as_list():
 1144            sql_column_list.append(f'"{col}"')
 1145        return ",".join(sql_column_list)
 1146
 1147    def get_header_sample_list(
 1148        self, check: bool = False, samples: list = None, samples_force: bool = False
 1149    ) -> list:
 1150        """
 1151        The function `get_header_sample_list` returns a list of samples from a VCF header, with optional
 1152        checking and filtering based on input parameters.
 1153
 1154        :param check: The `check` parameter in the `get_header_sample_list` function is a boolean
 1155        parameter that determines whether to check if the samples in the list are properly defined as
 1156        genotype columns. If `check` is set to `True`, the function will verify if each sample in the
 1157        list is defined as a, defaults to False
 1158        :type check: bool (optional)
 1159        :param samples: The `samples` parameter in the `get_header_sample_list` function is a list that
 1160        allows you to specify a subset of samples from the header. If you provide a list of sample
 1161        names, the function will check if each sample is defined in the header. If a sample is not found
 1162        in the
 1163        :type samples: list
 1164        :param samples_force: The `samples_force` parameter in the `get_header_sample_list` function is
 1165        a boolean parameter that determines whether to force the function to return the sample list
 1166        without checking if the samples are genotype columns. If `samples_force` is set to `True`, the
 1167        function will return the sample list without performing, defaults to False
 1168        :type samples_force: bool (optional)
 1169        :return: The function `get_header_sample_list` returns a list of samples based on the input
 1170        parameters and conditions specified in the function.
 1171        """
 1172
 1173        # Init
 1174        samples_list = []
 1175
 1176        if samples is None:
 1177            samples_list = self.header_vcf.samples
 1178        else:
 1179            samples_checked = []
 1180            for sample in samples:
 1181                if sample in self.header_vcf.samples:
 1182                    samples_checked.append(sample)
 1183                else:
 1184                    log.warning(f"Sample '{sample}' not defined in header")
 1185            samples_list = samples_checked
 1186
 1187            # Force sample list without checking if is_genotype_column
 1188            if samples_force:
 1189                log.warning(f"Samples {samples_list} not checked if genotypes")
 1190                return samples_list
 1191
 1192        if check:
 1193            samples_checked = []
 1194            for sample in samples_list:
 1195                if self.is_genotype_column(column=sample):
 1196                    samples_checked.append(sample)
 1197                else:
 1198                    log.warning(
 1199                        f"Sample '{sample}' not defined as a sample (genotype not well defined)"
 1200                    )
 1201            samples_list = samples_checked
 1202
 1203        # Return samples list
 1204        return samples_list
 1205
 1206    def is_genotype_column(self, column: str = None) -> bool:
 1207        """
 1208        This function checks if a given column is a genotype column in a database.
 1209
 1210        :param column: The `column` parameter in the `is_genotype_column` method is a string that
 1211        represents the column name in a database table. This method checks if the specified column is a
 1212        genotype column in the database. If a column name is provided, it calls the `is_genotype_column`
 1213        method of
 1214        :type column: str
 1215        :return: The `is_genotype_column` method is returning a boolean value. If the `column` parameter
 1216        is not None, it calls the `is_genotype_column` method of the `Database` class with the specified
 1217        column name and returns the result. If the `column` parameter is None, it returns False.
 1218        """
 1219
 1220        if column is not None:
 1221            return Database(database=self.get_input()).is_genotype_column(column=column)
 1222        else:
 1223            return False
 1224
 1225    def get_verbose(self) -> bool:
 1226        """
 1227        It returns the value of the "verbose" key in the config dictionary, or False if the key doesn't
 1228        exist
 1229
 1230        :return: The value of the key "verbose" in the config dictionary.
 1231        """
 1232        return self.get_config().get("verbose", False)
 1233
 1234    def get_connexion_format(self) -> str:
 1235        """
 1236        It returns the connexion format of the object.
 1237        :return: The connexion_format is being returned.
 1238        """
 1239        connexion_format = self.connexion_format
 1240        if connexion_format not in ["duckdb", "sqlite"]:
 1241            log.error(f"Unknown connexion format {connexion_format}")
 1242            raise ValueError(f"Unknown connexion format {connexion_format}")
 1243        else:
 1244            return connexion_format
 1245
    def insert_file_to_table(
        self,
        file,
        columns: str,
        header_len: int = 0,
        sep: str = "\t",
        chunksize: int = 1000000,
    ) -> None:
        """
        Read a delimited file in chunks and insert each chunk into the
        "variants" table of the current connexion.

        :param file: Path of the delimited file to load
        :param columns: Comma-separated column names to insert (used only
        for the DuckDB INSERT statement)
        :type columns: str
        :param header_len: Number of lines to skip at the top of the file
        before reading data, defaults to 0
        :type header_len: int (optional)
        :param sep: Field separator used in the file, defaults to "\t"
        :type sep: str (optional)
        :param chunksize: Number of rows read per chunk; overridden by the
        "load.chunk" configuration value when present, defaults to 1000000
        :type chunksize: int (optional)
        """

        # Config: the configured load chunk size overrides the argument
        chunksize = self.get_config().get("load", {}).get("chunk", chunksize)
        connexion_format = self.get_connexion_format()

        log.debug("chunksize: " + str(chunksize))

        # NOTE: a falsy chunksize (e.g. 0 or None from config) silently
        # skips loading altogether
        if chunksize:
            for chunk in pd.read_csv(
                file, skiprows=header_len, sep=sep, chunksize=chunksize, engine="c"
            ):
                if connexion_format in ["duckdb"]:
                    # DuckDB resolves "chunk" in the SQL text via replacement
                    # scan of the local pandas DataFrame of the same name —
                    # do not rename this variable
                    sql_insert_into = (
                        f"INSERT INTO variants ({columns}) SELECT {columns} FROM chunk"
                    )
                    self.conn.execute(sql_insert_into)
                elif connexion_format in ["sqlite"]:
                    # SQLite path: pandas handles the append directly
                    chunk.to_sql("variants", self.conn, if_exists="append", index=False)
 1299
 1300    def load_data(
 1301        self,
 1302        input_file: str = None,
 1303        drop_variants_table: bool = False,
 1304        sample_size: int = 20480,
 1305    ) -> None:
 1306        """
 1307        The `load_data` function reads a VCF file and inserts it into a table, with options to drop the
 1308        table before loading the data and specify a sample size.
 1309
 1310        :param input_file: The path to the input file. This is the VCF file that will be loaded into the
 1311        table
 1312        :type input_file: str
 1313        :param drop_variants_table: The `drop_variants_table` parameter is a boolean flag that
 1314        determines whether the variants table should be dropped before loading the data. If set to
 1315        `True`, the variants table will be dropped. If set to `False` (default), the variants table will
 1316        not be dropped, defaults to False
 1317        :type drop_variants_table: bool (optional)
 1318        :param sample_size: The `sample_size` parameter determines the number of rows to be sampled from
 1319        the input file. If it is set to `None`, the default value of 20480 will be used, defaults to
 1320        20480
 1321        :type sample_size: int (optional)
 1322        """
 1323
 1324        log.info("Loading...")
 1325
 1326        # change input file
 1327        if input_file:
 1328            self.set_input(input_file)
 1329            self.set_header()
 1330
 1331        # drop variants table
 1332        if drop_variants_table:
 1333            self.drop_variants_table()
 1334
 1335        # get table variants
 1336        table_variants = self.get_table_variants()
 1337
 1338        # Access
 1339        access = self.get_config().get("access", None)
 1340        log.debug(f"access: {access}")
 1341
 1342        # Input format and compress
 1343        input_format = self.get_input_format()
 1344        input_compressed = self.get_input_compressed()
 1345        log.debug(f"input_format: {input_format}")
 1346        log.debug(f"input_compressed: {input_compressed}")
 1347
 1348        # input_compressed_format
 1349        if input_compressed:
 1350            input_compressed_format = "gzip"
 1351        else:
 1352            input_compressed_format = "none"
 1353        log.debug(f"input_compressed_format: {input_compressed_format}")
 1354
 1355        # Connexion format
 1356        connexion_format = self.get_connexion_format()
 1357
 1358        # Sample size
 1359        if not sample_size:
 1360            sample_size = -1
 1361        log.debug(f"sample_size: {sample_size}")
 1362
 1363        # Load data
 1364        log.debug(f"Load Data from {input_format}")
 1365
 1366        # DuckDB connexion
 1367        if connexion_format in ["duckdb"]:
 1368
 1369            # Database already exists
 1370            if self.input_format in ["db", "duckdb"]:
 1371
 1372                if connexion_format in ["duckdb"]:
 1373                    log.debug(f"Input file format '{self.input_format}' duckDB")
 1374                else:
 1375                    log.error(
 1376                        f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'"
 1377                    )
 1378                    raise ValueError(
 1379                        f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'"
 1380                    )
 1381
 1382            # Load from existing database format
 1383            else:
 1384
 1385                try:
 1386                    # Create Table or View
 1387                    database = Database(database=self.input)
 1388                    sql_from = database.get_sql_from(sample_size=sample_size)
 1389
 1390                    if access in ["RO"]:
 1391                        sql_load = (
 1392                            f"CREATE VIEW {table_variants} AS SELECT * FROM {sql_from}"
 1393                        )
 1394                    else:
 1395                        sql_load = (
 1396                            f"CREATE TABLE {table_variants} AS SELECT * FROM {sql_from}"
 1397                        )
 1398                    self.conn.execute(sql_load)
 1399
 1400                except:
 1401                    # Format not available
 1402                    log.error(f"Input file format '{self.input_format}' not available")
 1403                    raise ValueError(
 1404                        f"Input file format '{self.input_format}' not available"
 1405                    )
 1406
 1407        # SQLite connexion
 1408        elif connexion_format in ["sqlite"] and input_format in [
 1409            "vcf",
 1410            "tsv",
 1411            "csv",
 1412            "psv",
 1413        ]:
 1414
 1415            # Main structure
 1416            structure = {
 1417                "#CHROM": "VARCHAR",
 1418                "POS": "INTEGER",
 1419                "ID": "VARCHAR",
 1420                "REF": "VARCHAR",
 1421                "ALT": "VARCHAR",
 1422                "QUAL": "VARCHAR",
 1423                "FILTER": "VARCHAR",
 1424                "INFO": "VARCHAR",
 1425            }
 1426
 1427            # Strcuture with samples
 1428            structure_complete = structure
 1429            if self.get_header_sample_list():
 1430                structure["FORMAT"] = "VARCHAR"
 1431                for sample in self.get_header_sample_list():
 1432                    structure_complete[sample] = "VARCHAR"
 1433
 1434            # Columns list for create and insert
 1435            sql_create_table_columns = []
 1436            sql_create_table_columns_list = []
 1437            for column in structure_complete:
 1438                column_type = structure_complete[column]
 1439                sql_create_table_columns.append(
 1440                    f'"{column}" {column_type} default NULL'
 1441                )
 1442                sql_create_table_columns_list.append(f'"{column}"')
 1443
 1444            # Create database
 1445            log.debug(f"Create Table {table_variants}")
 1446            sql_create_table_columns_sql = ", ".join(sql_create_table_columns)
 1447            sql_create_table_columns_list_sql = ", ".join(sql_create_table_columns_list)
 1448            sql_create_table = f"CREATE TABLE IF NOT EXISTS {table_variants} ({sql_create_table_columns_sql})"
 1449            self.conn.execute(sql_create_table)
 1450
 1451            # chunksize define length of file chunk load file
 1452            chunksize = 100000
 1453
 1454            # delimiter
 1455            delimiter = file_format_delimiters.get(input_format, "\t")
 1456
 1457            # Load the input file
 1458            with open(self.input, "rt") as input_file:
 1459
 1460                # Use the appropriate file handler based on the input format
 1461                if input_compressed:
 1462                    input_file = bgzf.open(self.input, "rt")
 1463                if input_format in ["vcf"]:
 1464                    header_len = self.get_header_length()
 1465                else:
 1466                    header_len = 0
 1467
 1468                # Insert the file contents into a table
 1469                self.insert_file_to_table(
 1470                    input_file,
 1471                    columns=sql_create_table_columns_list_sql,
 1472                    header_len=header_len,
 1473                    sep=delimiter,
 1474                    chunksize=chunksize,
 1475                )
 1476
 1477        else:
 1478            log.error(
 1479                f"Connexion format '{connexion_format}' not available with format '{input_format}'"
 1480            )
 1481            raise ValueError(
 1482                f"Connexion format '{connexion_format}' not available with format '{input_format}'"
 1483            )
 1484
 1485        # Explode INFOS fields into table fields
 1486        if self.get_explode_infos():
 1487            self.explode_infos(
 1488                prefix=self.get_explode_infos_prefix(),
 1489                fields=self.get_explode_infos_fields(),
 1490                force=True,
 1491            )
 1492
 1493        # Create index after insertion
 1494        self.create_indexes()
 1495
 1496    def get_explode_infos(self) -> bool:
 1497        """
 1498        The function `get_explode_infos` returns the value of the "explode_infos" parameter, defaulting
 1499        to False if it is not set.
 1500        :return: The method is returning the value of the "explode_infos" parameter, which is a boolean
 1501        value. If the parameter is not present, it will return False.
 1502        """
 1503
 1504        return self.get_param().get("explode", {}).get("explode_infos", False)
 1505
 1506    def get_explode_infos_fields(
 1507        self,
 1508        explode_infos_fields: str = None,
 1509        remove_fields_not_in_header: bool = False,
 1510    ) -> list:
 1511        """
 1512        The `get_explode_infos_fields` function returns a list of exploded information fields based on
 1513        the input parameter `explode_infos_fields`.
 1514
 1515        :param explode_infos_fields: The `explode_infos_fields` parameter is a string that specifies the
 1516        fields to be exploded. It can be set to "ALL" to explode all fields, or it can be a
 1517        comma-separated list of field names to explode
 1518        :type explode_infos_fields: str
 1519        :param remove_fields_not_in_header: The parameter `remove_fields_not_in_header` is a boolean
 1520        flag that determines whether to remove fields that are not present in the header. If it is set
 1521        to `True`, any field that is not in the header will be excluded from the list of exploded
 1522        information fields. If it is set to `, defaults to False
 1523        :type remove_fields_not_in_header: bool (optional)
 1524        :return: The function `get_explode_infos_fields` returns a list of exploded information fields.
 1525        If the `explode_infos_fields` parameter is not provided or is set to None, it returns an empty
 1526        list. If the parameter is provided and its value is "ALL", it also returns an empty list.
 1527        Otherwise, it returns a list of exploded information fields after removing any spaces and
 1528        splitting the string by commas.
 1529        """
 1530
 1531        # If no fields, get it in param
 1532        if not explode_infos_fields:
 1533            explode_infos_fields = (
 1534                self.get_param().get("explode", {}).get("explode_infos_fields", None)
 1535            )
 1536
 1537        # If no fields, defined as all fields in header using keyword
 1538        if not explode_infos_fields:
 1539            explode_infos_fields = "*"
 1540
 1541        # If fields list not empty
 1542        if explode_infos_fields:
 1543
 1544            # Input fields list
 1545            if isinstance(explode_infos_fields, str):
 1546                fields_input = explode_infos_fields.split(",")
 1547            elif isinstance(explode_infos_fields, list):
 1548                fields_input = explode_infos_fields
 1549            else:
 1550                fields_input = []
 1551
 1552            # Fields list without * keyword
 1553            fields_without_all = fields_input.copy()
 1554            if "*".casefold() in (item.casefold() for item in fields_without_all):
 1555                fields_without_all.remove("*")
 1556
 1557            # Fields in header
 1558            fields_in_header = sorted(list(set(self.get_header().infos)))
 1559
 1560            # Construct list of fields
 1561            fields_output = []
 1562            for field in fields_input:
 1563
 1564                # Strip field
 1565                field = field.strip()
 1566
 1567                # format keyword * in regex
 1568                if field.upper() in ["*"]:
 1569                    field = ".*"
 1570
 1571                # Find all fields with pattern
 1572                r = re.compile(field)
 1573                fields_search = sorted(list(filter(r.match, fields_in_header)))
 1574
 1575                # Remove fields input from search
 1576                if field in fields_search:
 1577                    fields_search = [field]
 1578                elif fields_search != [field]:
 1579                    fields_search = sorted(
 1580                        list(set(fields_search).difference(fields_input))
 1581                    )
 1582
 1583                # If field is not in header (avoid not well formatted header)
 1584                if not fields_search and not remove_fields_not_in_header:
 1585                    fields_search = [field]
 1586
 1587                # Add found fields
 1588                for new_field in fields_search:
 1589                    # Add field, if not already exists, and if it is in header (if asked)
 1590                    if (
 1591                        new_field not in fields_output
 1592                        and (
 1593                            not remove_fields_not_in_header
 1594                            or new_field in fields_in_header
 1595                        )
 1596                        and new_field not in [".*"]
 1597                    ):
 1598                        fields_output.append(new_field)
 1599
 1600            return fields_output
 1601
 1602        else:
 1603
 1604            return []
 1605
 1606    def get_explode_infos_prefix(self, explode_infos_prefix: str = None) -> str:
 1607        """
 1608        The function `get_explode_infos_prefix` returns the value of the `explode_infos_prefix` parameter, or
 1609        the value of `self.get_param().get("explode_infos_prefix", None)` if `explode_infos_prefix` is
 1610        not provided.
 1611
 1612        :param explode_infos_prefix: The parameter `explode_infos_prefix` is a string that specifies a
 1613        prefix to be used for exploding or expanding information
 1614        :type explode_infos_prefix: str
 1615        :return: the value of the variable `explode_infos_prefix`.
 1616        """
 1617
 1618        if not explode_infos_prefix:
 1619            explode_infos_prefix = (
 1620                self.get_param().get("explode", {}).get("explode_infos_prefix", "")
 1621            )
 1622
 1623        return explode_infos_prefix
 1624
 1625    def add_column(
 1626        self,
 1627        table_name,
 1628        column_name,
 1629        column_type,
 1630        default_value=None,
 1631        drop: bool = False,
 1632    ) -> dict:
 1633        """
 1634        The `add_column` function adds a column to a SQLite or DuckDB table with a default value if it
 1635        doesn't already exist.
 1636
 1637        :param table_name: The name of the table to which you want to add a column
 1638        :param column_name: The parameter "column_name" is the name of the column that you want to add
 1639        to the table
 1640        :param column_type: The `column_type` parameter specifies the data type of the column that you
 1641        want to add to the table. It should be a string that represents the desired data type, such as
 1642        "INTEGER", "TEXT", "REAL", etc
 1643        :param default_value: The `default_value` parameter is an optional parameter that specifies the
 1644        default value for the newly added column. If a default value is provided, it will be assigned to
 1645        the column for any existing rows that do not have a value for that column
 1646        :param drop: The `drop` parameter is a boolean flag that determines whether to drop the column
 1647        if it already exists in the table. If `drop` is set to `True`, the function will drop the
 1648        existing column before adding the new column. If `drop` is set to `False` (default),, defaults
 1649        to False
 1650        :type drop: bool (optional)
 1651        :return: a boolean value indicating whether the column was successfully added to the table.
 1652        """
 1653
 1654        # added
 1655        added = False
 1656        dropped = False
 1657
 1658        # Check if the column already exists in the table
 1659        query = f""" SELECT * FROM {table_name} LIMIT 0 """
 1660        columns = self.get_query_to_df(query).columns.tolist()
 1661        if column_name.upper() in [c.upper() for c in columns]:
 1662            log.debug(
 1663                f"The {column_name} column already exists in the {table_name} table"
 1664            )
 1665            if drop:
 1666                self.drop_column(table_name=table_name, column_name=column_name)
 1667                dropped = True
 1668            else:
 1669                return None
 1670        else:
 1671            log.debug(f"The {column_name} column NOT exists in the {table_name} table")
 1672
 1673        # Add column in table
 1674        add_column_query = (
 1675            f""" ALTER TABLE {table_name} ADD COLUMN "{column_name}" {column_type} """
 1676        )
 1677        if default_value is not None:
 1678            add_column_query += f" DEFAULT {default_value}"
 1679        self.execute_query(add_column_query)
 1680        added = not dropped
 1681        log.debug(
 1682            f"The {column_name} column was successfully added to the {table_name} table"
 1683        )
 1684
 1685        if added:
 1686            added_column = {
 1687                "table_name": table_name,
 1688                "column_name": column_name,
 1689                "column_type": column_type,
 1690                "default_value": default_value,
 1691            }
 1692        else:
 1693            added_column = None
 1694
 1695        return added_column
 1696
 1697    def drop_column(
 1698        self, column: dict = None, table_name: str = None, column_name: str = None
 1699    ) -> bool:
 1700        """
 1701        The `drop_column` function drops a specified column from a given table in a database and returns
 1702        True if the column was successfully dropped, and False if the column does not exist in the
 1703        table.
 1704
 1705        :param column: The `column` parameter is a dictionary that contains information about the column
 1706        you want to drop. It has two keys:
 1707        :type column: dict
 1708        :param table_name: The `table_name` parameter is the name of the table from which you want to
 1709        drop a column
 1710        :type table_name: str
 1711        :param column_name: The `column_name` parameter is the name of the column that you want to drop
 1712        from the table
 1713        :type column_name: str
 1714        :return: a boolean value. It returns True if the column was successfully dropped from the table,
 1715        and False if the column does not exist in the table.
 1716        """
 1717
 1718        # Find column infos
 1719        if column:
 1720            if isinstance(column, dict):
 1721                table_name = column.get("table_name", None)
 1722                column_name = column.get("column_name", None)
 1723            elif isinstance(column, str):
 1724                table_name = self.get_table_variants()
 1725                column_name = column
 1726            else:
 1727                table_name = None
 1728                column_name = None
 1729
 1730        if not table_name and not column_name:
 1731            return False
 1732
 1733        # Removed
 1734        removed = False
 1735
 1736        # Check if the column already exists in the table
 1737        query = f""" SELECT * FROM {table_name} LIMIT 0 """
 1738        columns = self.get_query_to_df(query).columns.tolist()
 1739        if column_name in columns:
 1740            log.debug(f"The {column_name} column exists in the {table_name} table")
 1741        else:
 1742            log.debug(f"The {column_name} column NOT exists in the {table_name} table")
 1743            return False
 1744
 1745        # Add column in table # ALTER TABLE integers DROP k
 1746        add_column_query = f""" ALTER TABLE {table_name} DROP "{column_name}" """
 1747        self.execute_query(add_column_query)
 1748        removed = True
 1749        log.debug(
 1750            f"The {column_name} column was successfully dropped to the {table_name} table"
 1751        )
 1752
 1753        return removed
 1754
    def explode_infos(
        self,
        prefix: str = None,
        create_index: bool = False,
        fields: list = None,
        force: bool = False,
        proccess_all_fields_together: bool = False,
        table: str = None,
    ) -> list:
        """
        Explode the INFO column of the variants table into one column per INFO
        field, and return the list of columns that were added.

        For each requested field, a typed column (prefix + field name) is added
        to the table and populated by extracting the `field=value` token from
        the INFO string (REGEXP_EXTRACT on duckdb, instr/substr arithmetic on
        sqlite). Updates are run per chromosome to keep each UPDATE bounded.
        Indexes are dropped before the update; nothing is done in "RO" access
        mode.

        :param prefix: Prefix for the exploded columns; when None/True or not a
        string, falls back to `get_explode_infos_prefix()`, then to "INFO/"
        :type prefix: str
        :param create_index: When True, re-create the indexes after the
        explosion, defaults to False
        :type create_index: bool (optional)
        :param fields: INFO fields to explode (patterns allowed, resolved by
        `get_explode_infos_fields`); when empty, all header fields are used
        :type fields: list
        :param force: When True, an already-existing column is dropped,
        re-created and re-populated, defaults to False
        :type force: bool (optional)
        :param proccess_all_fields_together: When True, all field updates are
        combined into a single UPDATE statement; when False each field gets its
        own UPDATE, defaults to False
        :type proccess_all_fields_together: bool (optional)
        :param table: Target table name; defaults to the variants table
        :type table: str
        :return: List of dicts describing the added columns (as returned by
        `add_column`); empty in "RO" mode or when nothing was added.
        """

        # Drop indexes before mass updates (they would slow the UPDATEs down)
        self.drop_indexes()

        # connexion format (decides which SQL extraction dialect is used below)
        connexion_format = self.get_connexion_format()

        # Access mode; "RO" means the table cannot be altered
        access = self.get_config().get("access", None)

        # Columns added by this call (returned to the caller)
        added_columns = []

        if access not in ["RO"]:

            # Resolve the column prefix (param value, then "INFO/" fallback)
            if prefix in [None, True] or not isinstance(prefix, str):
                if self.get_explode_infos_prefix() not in [None, True]:
                    prefix = self.get_explode_infos_prefix()
                else:
                    prefix = "INFO/"

            # Target table (variants table by default)
            if table is not None:
                table_variants = table
            else:
                table_variants = self.get_table_variants(clause="select")

            # Extra infos, if available
            # NOTE(review): bare except silently falls back to [] on ANY error
            # from get_extra_infos, not only "no extra infos" — confirm intent
            try:
                extra_infos = self.get_extra_infos()
            except:
                extra_infos = []

            # INFO fields declared in the VCF header
            header_infos = self.get_header().infos

            log.debug(
                f"Explode INFO fields - ADD [{len(header_infos)}] annotations fields"
            )

            # One SET clause per field, collected then run per chromosome below
            sql_info_alter_table_array = []

            # Known fields: header fields plus explicitly requested ones
            fields_list = list(header_infos)
            if fields:
                fields_list += fields
            fields_list = set(fields_list)

            # If no fields
            if not fields:
                fields = []

            # Translate fields if patterns (e.g. "*" or regex entries)
            fields = self.get_explode_infos_fields(explode_infos_fields=fields)

            for info in fields:

                # Name of the exploded column
                info_id_sql = prefix + info

                if (
                    info in fields_list
                    or prefix + info in fields_list
                    or info in extra_infos
                ):

                    log.debug(f"Explode INFO fields - ADD '{info}' annotations fields")

                    # Type/arity from the header; String when not declared
                    if info in header_infos:
                        info_type = header_infos[info].type
                        info_num = header_infos[info].num
                    else:
                        info_type = "String"
                        info_num = 0

                    # Map VCF type to SQL; multi-valued fields stay VARCHAR
                    type_sql = self.code_type_map_to_sql.get(info_type, "VARCHAR")
                    if info_num != 1:
                        type_sql = "VARCHAR"

                    # Add field (dropped and re-created when force is True)
                    added_column = self.add_column(
                        table_name=table_variants,
                        column_name=info_id_sql,
                        column_type=type_sql,
                        default_value="null",
                        drop=force,
                    )

                    if added_column:
                        added_columns.append(added_column)

                    if added_column or force:

                        # add field to index
                        self.index_additionnal_fields.append(info_id_sql)

                        # Build the SET clause extracting "info=value" from INFO;
                        # '' and '.' are normalized to NULL
                        if connexion_format in ["duckdb"]:
                            update_info_field = f"""
                            "{info_id_sql}" =
                                CASE
                                    WHEN REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1) IN ('','.') THEN NULL
                                    ELSE REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1)
                                END
                            """
                        elif connexion_format in ["sqlite"]:
                            update_info_field = f"""
                                "{info_id_sql}" =
                                    CASE
                                        WHEN instr(INFO, '{info}=') = 0 THEN NULL
                                        WHEN instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';') = 0 THEN substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1)
                                        ELSE substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1, instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';')-instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')-1)
                                    END
                            """

                        sql_info_alter_table_array.append(update_info_field)

            if sql_info_alter_table_array:

                # Run updates per chromosome to bound each UPDATE's size
                # NOTE(review): bare except falls back to a single unfiltered
                # UPDATE on any query failure — confirm this is intended
                try:
                    chromosomes_list = list(
                        self.get_query_to_df(
                            f""" SELECT "#CHROM" FROM {table_variants} GROUP BY "#CHROM" """
                        )["#CHROM"]
                    )
                except:
                    chromosomes_list = [None]

                for chrom in chromosomes_list:
                    log.debug(f"Explode INFO fields - Chromosome {chrom}...")

                    # Where clause (only useful with more than one chromosome)
                    where_clause = ""
                    if chrom and len(chromosomes_list) > 1:
                        where_clause = f""" WHERE "#CHROM" = '{chrom}' """

                    # Update table
                    if proccess_all_fields_together:
                        # All SET clauses combined into one UPDATE statement
                        sql_info_alter_table_array_join = ", ".join(
                            sql_info_alter_table_array
                        )
                        if sql_info_alter_table_array_join:
                            sql_info_alter_table = f"""
                                UPDATE {table_variants}
                                SET {sql_info_alter_table_array_join}
                                {where_clause}
                                """
                            log.debug(
                                f"Explode INFO fields - Explode all {len(sql_info_alter_table_array)} fields..."
                            )
                            # log.debug(sql_info_alter_table)
                            self.conn.execute(sql_info_alter_table)
                    else:
                        # One UPDATE per field
                        sql_info_alter_num = 0
                        for sql_info_alter in sql_info_alter_table_array:
                            sql_info_alter_num += 1
                            sql_info_alter_table = f"""
                                UPDATE {table_variants}
                                SET {sql_info_alter}
                                {where_clause}
                                """
                            log.debug(
                                f"Explode INFO fields - Explode field {sql_info_alter_num}/{len(sql_info_alter_table_array)}..."
                            )
                            # log.debug(sql_info_alter_table)
                            self.conn.execute(sql_info_alter_table)

        # Re-create indexes if requested
        if create_index:
            self.create_indexes()

        return added_columns
 1971
 1972    def create_indexes(self) -> None:
 1973        """
 1974        Create indexes on the table after insertion
 1975        """
 1976
 1977        # Access
 1978        access = self.get_config().get("access", None)
 1979
 1980        # get table variants
 1981        table_variants = self.get_table_variants("FROM")
 1982
 1983        if self.get_indexing() and access not in ["RO"]:
 1984            # Create index
 1985            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()} ON {table_variants} ("#CHROM", "POS", "REF", "ALT")'
 1986            self.conn.execute(sql_create_table_index)
 1987            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_chrom ON {table_variants} ("#CHROM")'
 1988            self.conn.execute(sql_create_table_index)
 1989            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_pos ON {table_variants} ("POS")'
 1990            self.conn.execute(sql_create_table_index)
 1991            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_ref ON {table_variants} ( "REF")'
 1992            self.conn.execute(sql_create_table_index)
 1993            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_alt ON {table_variants} ("ALT")'
 1994            self.conn.execute(sql_create_table_index)
 1995            for field in self.index_additionnal_fields:
 1996                sql_create_table_index = f""" CREATE INDEX IF NOT EXISTS "idx_{self.get_table_variants()}_{field}" ON {table_variants} ("{field}") """
 1997                self.conn.execute(sql_create_table_index)
 1998
 1999    def drop_indexes(self) -> None:
 2000        """
 2001        Create indexes on the table after insertion
 2002        """
 2003
 2004        # Access
 2005        access = self.get_config().get("access", None)
 2006
 2007        # get table variants
 2008        table_variants = self.get_table_variants("FROM")
 2009
 2010        # Get database format
 2011        connexion_format = self.get_connexion_format()
 2012
 2013        if access not in ["RO"]:
 2014            if connexion_format in ["duckdb"]:
 2015                sql_list_indexes = f"SELECT index_name FROM duckdb_indexes WHERE table_name='{table_variants}'"
 2016            elif connexion_format in ["sqlite"]:
 2017                sql_list_indexes = f"SELECT name FROM sqlite_master WHERE type='index' AND tbl_name='{table_variants}';"
 2018
 2019            list_indexes = self.conn.execute(sql_list_indexes)
 2020            index_names = [row[0] for row in list_indexes.fetchall()]
 2021            for index in index_names:
 2022                sql_drop_table_index = f""" DROP INDEX IF EXISTS "{index}" """
 2023                self.conn.execute(sql_drop_table_index)
 2024
 2025    def read_vcf_header(self, f) -> list:
 2026        """
 2027        It reads the header of a VCF file and returns a list of the header lines
 2028
 2029        :param f: the file object
 2030        :return: The header lines of the VCF file.
 2031        """
 2032
 2033        header_list = []
 2034        for line in f:
 2035            header_list.append(line)
 2036            if line.startswith("#CHROM"):
 2037                break
 2038        return header_list
 2039
 2040    def read_vcf_header_file(self, file: str = None) -> list:
 2041        """
 2042        The `read_vcf_header_file` function reads the header of a VCF file, handling both compressed and
 2043        uncompressed files.
 2044
 2045        :param file: The `file` parameter is a string that represents the path to the VCF header file
 2046        that you want to read. It is an optional parameter, so if you don't provide a value, it will
 2047        default to `None`
 2048        :type file: str
 2049        :return: The function `read_vcf_header_file` returns a list.
 2050        """
 2051
 2052        if self.get_input_compressed(input_file=file):
 2053            with bgzf.open(file, "rt") as f:
 2054                return self.read_vcf_header(f=f)
 2055        else:
 2056            with open(file, "rt") as f:
 2057                return self.read_vcf_header(f=f)
 2058
 2059    def execute_query(self, query: str):
 2060        """
 2061        It takes a query as an argument, executes it, and returns the results
 2062
 2063        :param query: The query to be executed
 2064        :return: The result of the query is being returned.
 2065        """
 2066        if query:
 2067            return self.conn.execute(query)  # .fetchall()
 2068        else:
 2069            return None
 2070
 2071    def export_output(
 2072        self,
 2073        output_file: str | None = None,
 2074        output_header: str | None = None,
 2075        export_header: bool = True,
 2076        query: str | None = None,
 2077        parquet_partitions: list | None = None,
 2078        chunk_size: int | None = None,
 2079        threads: int | None = None,
 2080        sort: bool = False,
 2081        index: bool = False,
 2082        order_by: str | None = None,
 2083    ) -> bool:
 2084        """
 2085        The `export_output` function exports data from a VCF file to a specified output file in various
 2086        formats, including VCF, CSV, TSV, PSV, and Parquet.
 2087
 2088        :param output_file: The `output_file` parameter is a string that specifies the name of the
 2089        output file to be generated by the function. This is where the exported data will be saved
 2090        :type output_file: str
 2091        :param output_header: The `output_header` parameter is a string that specifies the name of the
 2092        file where the header of the VCF file will be exported. If this parameter is not provided, the
 2093        header will be exported to a file with the same name as the `output_file` parameter, but with
 2094        the extension "
 2095        :type output_header: str
 2096        :param export_header: The `export_header` parameter is a boolean flag that determines whether
 2097        the header of a VCF file should be exported to a separate file or not. If `export_header` is
 2098        True, the header will be exported to a file. If `export_header` is False, the header will not
 2099        be, defaults to True, if output format is not VCF
 2100        :type export_header: bool (optional)
 2101        :param query: The `query` parameter is an optional SQL query that can be used to filter and
 2102        select specific data from the VCF file before exporting it. If provided, only the data that
 2103        matches the query will be exported
 2104        :type query: str
 2105        :param parquet_partitions: The `parquet_partitions` parameter is a list that specifies the
 2106        columns to be used for partitioning the Parquet file during export. Partitioning is a way to
 2107        organize data in a hierarchical directory structure based on the values of one or more columns.
 2108        This can improve query performance when working with large datasets
 2109        :type parquet_partitions: list
 2110        :param chunk_size: The `chunk_size` parameter specifies the number of
 2111        records in batch when exporting data in Parquet format. This parameter is used for
 2112        partitioning the Parquet file into multiple files.
 2113        :type chunk_size: int
 2114        :param threads: The `threads` parameter is an optional parameter that specifies the number of
 2115        threads to be used during the export process. It determines the level of parallelism and can
 2116        improve the performance of the export operation. If not provided, the function will use the
 2117        default number of threads
 2118        :type threads: int
 2119        :param sort: The `sort` parameter is a boolean flag that determines whether the output file
 2120        should be sorted or not. If `sort` is set to `True`, the output file will be sorted based on the
 2121        genomic coordinates of the variants. By default, the value of `sort` is `False`, defaults to
 2122        False
 2123        :type sort: bool (optional)
 2124        :param index: The `index` parameter is a boolean flag that determines whether an index should be
 2125        created on the output file. If `index` is True, an index will be created. If `index` is False,
 2126        no index will be created. The default value is False, defaults to False
 2127        :type index: bool (optional)
 2128        :param order_by: The `order_by` parameter is a string that specifies the column(s) to use for
 2129        sorting the output file. This parameter is only applicable when exporting data in VCF format
 2130        :type order_by: str
 2131        :return: a boolean value. It checks if the output file exists and returns True if it does, or
 2132        None if it doesn't.
 2133        """
 2134
 2135        # Log
 2136        log.info("Exporting...")
 2137
 2138        # Full path
 2139        output_file = full_path(output_file)
 2140        output_header = full_path(output_header)
 2141
 2142        # Config
 2143        config = self.get_config()
 2144
 2145        # Param
 2146        param = self.get_param()
 2147
 2148        # Tmp files to remove
 2149        tmp_to_remove = []
 2150
 2151        # If no output, get it
 2152        if not output_file:
 2153            output_file = self.get_output()
 2154
 2155        # If not threads
 2156        if not threads:
 2157            threads = self.get_threads()
 2158
 2159        # Auto header name with extension
 2160        if export_header or output_header:
 2161            if not output_header:
 2162                output_header = f"{output_file}.hdr"
 2163            # Export header
 2164            self.export_header(output_file=output_file)
 2165
 2166        # Switch off export header if VCF output
 2167        output_file_type = get_file_format(output_file)
 2168        if output_file_type in ["vcf"]:
 2169            export_header = False
 2170            tmp_to_remove.append(output_header)
 2171
 2172        # Chunk size
 2173        if not chunk_size:
 2174            chunk_size = config.get("chunk_size", None)
 2175
 2176        # Parquet partition
 2177        if not parquet_partitions:
 2178            parquet_partitions = param.get("export", {}).get("parquet_partitions", None)
 2179        if parquet_partitions and isinstance(parquet_partitions, str):
 2180            parquet_partitions = parquet_partitions.split(",")
 2181
 2182        # Order by
 2183        if not order_by:
 2184            order_by = param.get("export", {}).get("order_by", "")
 2185
 2186        # Header in output
 2187        header_in_output = param.get("export", {}).get("include_header", False)
 2188
 2189        # Database
 2190        database_source = self.get_connexion()
 2191
 2192        # Connexion format
 2193        connexion_format = self.get_connexion_format()
 2194
 2195        # Explode infos
 2196        if self.get_explode_infos():
 2197            self.explode_infos(
 2198                prefix=self.get_explode_infos_prefix(),
 2199                fields=self.get_explode_infos_fields(),
 2200                force=False,
 2201            )
 2202
 2203        # if connexion_format in ["sqlite"] or query:
 2204        if connexion_format in ["sqlite"]:
 2205
 2206            # Export in Parquet
 2207            random_tmp = "".join(
 2208                random.choice(string.ascii_lowercase) for i in range(10)
 2209            )
 2210            database_source = f"""{output_file}.{random_tmp}.database_export.parquet"""
 2211            tmp_to_remove.append(database_source)
 2212
 2213            # Table Variants
 2214            table_variants = self.get_table_variants()
 2215
 2216            # Create export query
 2217            sql_query_export_subquery = f"""
 2218                SELECT * FROM {table_variants}
 2219                """
 2220
 2221            # Write source file
 2222            fp.write(database_source, self.get_query_to_df(sql_query_export_subquery))
 2223
 2224        # Create database
 2225        database = Database(
 2226            database=database_source,
 2227            table="variants",
 2228            header_file=output_header,
 2229            conn_config=self.get_connexion_config(),
 2230        )
 2231
 2232        # Existing colomns header
 2233        existing_columns_header = database.get_header_columns_from_database()
 2234
 2235        # Sample list
 2236        get_samples = self.get_samples()
 2237        get_samples_check = self.get_samples_check()
 2238        samples_force = get_samples is not None
 2239        sample_list = self.get_header_sample_list(
 2240            check=get_samples_check, samples=get_samples, samples_force=samples_force
 2241        )
 2242
 2243        # Export file
 2244        database.export(
 2245            output_database=output_file,
 2246            output_header=output_header,
 2247            existing_columns_header=existing_columns_header,
 2248            parquet_partitions=parquet_partitions,
 2249            chunk_size=chunk_size,
 2250            threads=threads,
 2251            sort=sort,
 2252            index=index,
 2253            header_in_output=header_in_output,
 2254            order_by=order_by,
 2255            query=query,
 2256            export_header=export_header,
 2257            sample_list=sample_list,
 2258        )
 2259
 2260        # Remove
 2261        remove_if_exists(tmp_to_remove)
 2262
 2263        return (os.path.exists(output_file) or None) and (
 2264            os.path.exists(output_file) or None
 2265        )
 2266
 2267    def get_extra_infos(self, table: str = None) -> list:
 2268        """
 2269        The `get_extra_infos` function returns a list of columns that are in a specified table but not
 2270        in the header.
 2271
 2272        :param table: The `table` parameter in the `get_extra_infos` function is used to specify the
 2273        name of the table from which you want to retrieve the extra columns that are not present in the
 2274        header. If the `table` parameter is not provided when calling the function, it will default to
 2275        using the variants
 2276        :type table: str
 2277        :return: A list of columns that are in the specified table but not in the header of the table.
 2278        """
 2279
 2280        header_columns = []
 2281
 2282        if not table:
 2283            table = self.get_table_variants(clause="from")
 2284            header_columns = self.get_header_columns()
 2285
 2286        # Check all columns in the database
 2287        query = f""" SELECT * FROM {table} LIMIT 1 """
 2288        log.debug(f"query {query}")
 2289        table_columns = self.get_query_to_df(query).columns.tolist()
 2290        extra_columns = []
 2291
 2292        # Construct extra infos (not in header)
 2293        for column in table_columns:
 2294            if column not in header_columns:
 2295                extra_columns.append(column)
 2296
 2297        return extra_columns
 2298
 2299    def get_extra_infos_sql(self, table: str = None) -> str:
 2300        """
 2301        It returns a string of the extra infos, separated by commas, and each extra info is surrounded
 2302        by double quotes
 2303
 2304        :param table: The name of the table to get the extra infos from. If None, the default table is
 2305        used
 2306        :type table: str
 2307        :return: A string of the extra infos
 2308        """
 2309
 2310        return ", ".join(
 2311            ['"' + str(elem) + '"' for elem in self.get_extra_infos(table=table)]
 2312        )
 2313
 2314    def export_header(
 2315        self,
 2316        header_name: str = None,
 2317        output_file: str = None,
 2318        output_file_ext: str = ".hdr",
 2319        clean_header: bool = True,
 2320        remove_chrom_line: bool = False,
 2321    ) -> str:
 2322        """
 2323        The `export_header` function takes a VCF file, extracts the header, modifies it according to
 2324        specified options, and writes it to a new file.
 2325
 2326        :param header_name: The `header_name` parameter is the name of the header file to be created. If
 2327        this parameter is not specified, the header will be written to the output file
 2328        :type header_name: str
 2329        :param output_file: The `output_file` parameter in the `export_header` function is used to
 2330        specify the name of the output file where the header will be written. If this parameter is not
 2331        provided, the header will be written to a temporary file
 2332        :type output_file: str
 2333        :param output_file_ext: The `output_file_ext` parameter in the `export_header` function is a
 2334        string that represents the extension of the output header file. By default, it is set to ".hdr"
 2335        if not specified by the user. This extension will be appended to the `output_file` name to
 2336        create the final, defaults to .hdr
 2337        :type output_file_ext: str (optional)
 2338        :param clean_header: The `clean_header` parameter in the `export_header` function is a boolean
 2339        flag that determines whether the header should be cleaned or not. When `clean_header` is set to
 2340        `True`, the function will clean the header by modifying certain lines based on a specific
 2341        pattern. If `clean_header`, defaults to True
 2342        :type clean_header: bool (optional)
 2343        :param remove_chrom_line: The `remove_chrom_line` parameter in the `export_header` function is a
 2344        boolean flag that determines whether the #CHROM line should be removed from the header before
 2345        writing it to the output file. If set to `True`, the #CHROM line will be removed; if set to `,
 2346        defaults to False
 2347        :type remove_chrom_line: bool (optional)
 2348        :return: The function `export_header` returns the name of the temporary header file that is
 2349        created.
 2350        """
 2351
 2352        if not header_name and not output_file:
 2353            output_file = self.get_output()
 2354
 2355        if self.get_header():
 2356
 2357            # Get header object
 2358            header_obj = self.get_header()
 2359
 2360            # Create database
 2361            db_for_header = Database(database=self.get_input())
 2362
 2363            # Get real columns in the file
 2364            db_header_columns = db_for_header.get_columns()
 2365
 2366            with tempfile.TemporaryDirectory() as tmpdir:
 2367
 2368                # Write header file
 2369                header_file_tmp = os.path.join(tmpdir, "header")
 2370                f = open(header_file_tmp, "w")
 2371                vcf.Writer(f, header_obj)
 2372                f.close()
 2373
 2374                # Replace #CHROM line with rel columns
 2375                header_list = db_for_header.read_header_file(
 2376                    header_file=header_file_tmp
 2377                )
 2378                header_list[-1] = "\t".join(db_header_columns)
 2379
 2380                # Remove CHROM line
 2381                if remove_chrom_line:
 2382                    header_list.pop()
 2383
 2384                # Clean header
 2385                if clean_header:
 2386                    header_list_clean = []
 2387                    for head in header_list:
 2388                        # Clean head for malformed header
 2389                        head_clean = head
 2390                        head_clean = re.subn(
 2391                            "##FORMAT=<ID=(.*),Number=(.*),Type=Flag",
 2392                            r"##FORMAT=<ID=\1,Number=\2,Type=String",
 2393                            head_clean,
 2394                            2,
 2395                        )[0]
 2396                        # Write header
 2397                        header_list_clean.append(head_clean)
 2398                    header_list = header_list_clean
 2399
 2400            tmp_header_name = output_file + output_file_ext
 2401
 2402            f = open(tmp_header_name, "w")
 2403            for line in header_list:
 2404                f.write(line)
 2405            f.close()
 2406
 2407        return tmp_header_name
 2408
 2409    def export_variant_vcf(
 2410        self,
 2411        vcf_file,
 2412        remove_info: bool = False,
 2413        add_samples: bool = True,
 2414        list_samples: list = [],
 2415        where_clause: str = "",
 2416        index: bool = False,
 2417        threads: int | None = None,
 2418    ) -> bool | None:
 2419        """
 2420        The `export_variant_vcf` function exports a VCF file with specified samples, allowing options to
 2421        remove INFO field, add samples, and control compression and indexing.
 2422
 2423        :param vcf_file: The `vcf_file` parameter is the name of the file where the VCF data will be
 2424        written to. It is the output file that will contain the filtered VCF data based on the specified
 2425        parameters
 2426        :param remove_info: The `remove_info` parameter in the `export_variant_vcf` function is a
 2427        boolean flag that determines whether to remove the INFO field from the output VCF file. If set
 2428        to `True`, the INFO field will be removed. If set to `False`, the INFO field will be included
 2429        in, defaults to False
 2430        :type remove_info: bool (optional)
 2431        :param add_samples: The `add_samples` parameter is a boolean parameter that determines whether
 2432        the samples should be added to the VCF file or not. If set to True, the samples will be added.
 2433        If set to False, the samples will be removed. The default value is True, defaults to True
 2434        :type add_samples: bool (optional)
 2435        :param list_samples: The `list_samples` parameter is a list of samples that you want to include
 2436        in the output VCF file. By default, all samples will be included. If you provide a list of
 2437        samples, only those samples will be included in the output file
 2438        :type list_samples: list
 2439        :param index: The `index` parameter in the `export_variant_vcf` function is a boolean flag that
 2440        determines whether or not to create an index for the output VCF file. If `index` is set to
 2441        `True`, the output VCF file will be indexed using tabix. If `index`, defaults to False
 2442        :type index: bool (optional)
 2443        :param threads: The `threads` parameter in the `export_variant_vcf` function specifies the
 2444        number of threads to use for exporting the VCF file. It determines how many parallel threads
 2445        will be used during the export process. More threads can potentially speed up the export process
 2446        by utilizing multiple cores of the processor. If
 2447        :type threads: int | None
 2448        :return: The `export_variant_vcf` function returns the result of calling the `export_output`
 2449        method with various parameters including the output file, query, threads, sort flag, and index
 2450        flag. The `export_output` method is responsible for exporting the VCF data based on the
 2451        specified parameters and configurations provided in the `export_variant_vcf` function.
 2452        """
 2453
 2454        # Config
 2455        config = self.get_config()
 2456
 2457        # Extract VCF
 2458        log.debug("Export VCF...")
 2459
 2460        # Table variants
 2461        table_variants = self.get_table_variants()
 2462
 2463        # Threads
 2464        if not threads:
 2465            threads = self.get_threads()
 2466
 2467        # Info fields
 2468        if remove_info:
 2469            if not isinstance(remove_info, str):
 2470                remove_info = "."
 2471            info_field = f"""'{remove_info}' as INFO"""
 2472        else:
 2473            info_field = "INFO"
 2474
 2475        # Samples fields
 2476        if add_samples:
 2477            if not list_samples:
 2478                list_samples = self.get_header_sample_list()
 2479            if list_samples:
 2480                samples_fields = " , FORMAT , " + " , ".join(list_samples)
 2481            else:
 2482                samples_fields = ""
 2483            log.debug(f"samples_fields: {samples_fields}")
 2484        else:
 2485            samples_fields = ""
 2486
 2487        # Where clause
 2488        if where_clause is None:
 2489            where_clause = ""
 2490
 2491        # Variants
 2492        select_fields = """ "#CHROM", POS, ID, REF, ALT, QUAL, FILTER """
 2493        sql_query_select = f""" SELECT {select_fields}, {info_field} {samples_fields} FROM {table_variants} {where_clause} """
 2494        log.debug(f"sql_query_select={sql_query_select}")
 2495
 2496        return self.export_output(
 2497            output_file=vcf_file,
 2498            output_header=None,
 2499            export_header=True,
 2500            query=sql_query_select,
 2501            parquet_partitions=None,
 2502            chunk_size=config.get("chunk_size", None),
 2503            threads=threads,
 2504            sort=True,
 2505            index=index,
 2506            order_by=None,
 2507        )
 2508
 2509    def run_commands(self, commands: list = [], threads: int = 1) -> None:
 2510        """
 2511        It takes a list of commands and runs them in parallel using the number of threads specified
 2512
 2513        :param commands: A list of commands to run
 2514        :param threads: The number of threads to use, defaults to 1 (optional)
 2515        """
 2516
 2517        run_parallel_commands(commands, threads)
 2518
 2519    def get_threads(self, default: int = 1) -> int:
 2520        """
 2521        This function returns the number of threads to use for a job, with a default value of 1 if not
 2522        specified.
 2523
 2524        :param default: The `default` parameter in the `get_threads` method is used to specify the
 2525        default number of threads to use if no specific value is provided. If no value is provided for
 2526        the `threads` parameter in the configuration or input parameters, the `default` value will be
 2527        used, defaults to 1
 2528        :type default: int (optional)
 2529        :return: the number of threads to use for the current job.
 2530        """
 2531
 2532        # Config
 2533        config = self.get_config()
 2534
 2535        # Param
 2536        param = self.get_param()
 2537
 2538        # Input threads
 2539        input_thread = param.get("threads", config.get("threads", None))
 2540
 2541        # Check threads
 2542        if not input_thread:
 2543            threads = default
 2544        elif int(input_thread) <= 0:
 2545            threads = os.cpu_count()
 2546        else:
 2547            threads = int(input_thread)
 2548        return threads
 2549
 2550    def get_memory(self, default: str = None) -> str:
 2551        """
 2552        This function retrieves the memory value from parameters or configuration with a default value
 2553        if not found.
 2554
 2555        :param default: The `get_memory` function takes in a default value as a string parameter. This
 2556        default value is used as a fallback in case the `memory` parameter is not provided in the
 2557        `param` dictionary or the `config` dictionary. If `memory` is not found in either dictionary,
 2558        the function
 2559        :type default: str
 2560        :return: The `get_memory` function returns a string value representing the memory parameter. If
 2561        the `input_memory` is provided in the parameters, it will return that value. Otherwise, it will
 2562        return the default value provided as an argument to the function.
 2563        """
 2564
 2565        # Config
 2566        config = self.get_config()
 2567
 2568        # Param
 2569        param = self.get_param()
 2570
 2571        # Input threads
 2572        input_memory = param.get("memory", config.get("memory", None))
 2573
 2574        # Check threads
 2575        if input_memory:
 2576            memory = input_memory
 2577        else:
 2578            memory = default
 2579
 2580        return memory
 2581
 2582    def update_from_vcf(self, vcf_file: str) -> None:
 2583        """
 2584        > If the database is duckdb, then use the parquet method, otherwise use the sqlite method
 2585
 2586        :param vcf_file: the path to the VCF file
 2587        """
 2588
 2589        connexion_format = self.get_connexion_format()
 2590
 2591        if connexion_format in ["duckdb"]:
 2592            self.update_from_vcf_duckdb(vcf_file)
 2593        elif connexion_format in ["sqlite"]:
 2594            self.update_from_vcf_sqlite(vcf_file)
 2595
    def update_from_vcf_duckdb(self, vcf_file: str) -> None:
        """
        Merge INFO annotations from a VCF file into the variants table
        (duckdb backend).

        The VCF body is loaded into a pandas DataFrame and joined to the
        variants table on #CHROM/POS/REF/ALT; matching INFO values are
        concatenated onto the existing INFO column, with ';' inserted only
        when both sides carry a real value ('' and '.' count as empty).

        :param vcf_file: the path to the VCF file
        """

        # Variants table name
        table_variants = self.get_table_variants()

        # Load the VCF body into a DataFrame, skipping the header lines.
        # NOTE: the SQL below reads `FROM vcf_df` — duckdb resolves that
        # name to this local DataFrame (replacement scan), so the variable
        # must keep this exact name.
        skip = self.get_header_length(file=vcf_file)
        vcf_df = pd.read_csv(
            vcf_file,
            sep="\t",
            engine="c",
            skiprows=skip,
            header=0,
            low_memory=False,
        )
        # Append the joined INFO (with ';' separator when both sides are
        # non-empty) onto the existing INFO column
        sql_query_update = f"""
        UPDATE {table_variants} as table_variants
            SET INFO = concat(
                            CASE
                                WHEN INFO NOT IN ('', '.')
                                THEN INFO
                                ELSE ''
                            END,
                            (
                                SELECT 
                                    concat(
                                        CASE
                                            WHEN table_variants.INFO NOT IN ('','.') AND table_parquet.INFO NOT IN ('','.')
                                            THEN ';'
                                            ELSE ''
                                        END
                                        ,
                                        CASE
                                            WHEN table_parquet.INFO NOT IN ('','.')
                                            THEN table_parquet.INFO
                                            ELSE ''
                                        END
                                    )
                                FROM vcf_df as table_parquet
                                        WHERE CAST(table_parquet.\"#CHROM\" AS VARCHAR) = CAST(table_variants.\"#CHROM\" AS VARCHAR)
                                        AND table_parquet.\"POS\" = table_variants.\"POS\"
                                        AND table_parquet.\"ALT\" = table_variants.\"ALT\"
                                        AND table_parquet.\"REF\" = table_variants.\"REF\"
                                        AND table_parquet.INFO NOT IN ('','.')
                            )
                        )
            ;
            """
        self.conn.execute(sql_query_update)
 2651
    def update_from_vcf_sqlite(self, vcf_file: str) -> None:
        """
        Merge INFO annotations from a VCF file into the variants table
        (sqlite backend).

        The VCF body is loaded into a temporary table and joined to the
        variants table on #CHROM/POS/REF/ALT; matching INFO values are
        concatenated onto the existing INFO column, with ';' inserted only
        when both sides carry a real value ('' and '.' count as empty).

        :param vcf_file: The path to the VCF file you want to update the database with
        """

        # Create a temporary table with the same schema as variants, no rows
        table_vcf = "tmp_vcf"
        sql_create = (
            f"CREATE TEMPORARY TABLE {table_vcf} AS SELECT * FROM variants WHERE 0"
        )
        self.conn.execute(sql_create)

        # Load the VCF body into the temporary table.
        # NOTE(review): the 8 hard-coded names assume a sites-only VCF
        # (no FORMAT/sample columns); pandas raises if the column count
        # differs — confirm callers only pass such files.
        vcf_df = pd.read_csv(
            vcf_file, sep="\t", comment="#", header=None, low_memory=False
        )
        vcf_df.columns = ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"]
        vcf_df.to_sql(table_vcf, self.conn, if_exists="append", index=False)

        # Update table 'variants' with VCF data
        # warning: sqlite has no CONCAT function — uses the || operator
        sql_query_update = f"""
            UPDATE variants as table_variants
            SET INFO = CASE
                            WHEN INFO NOT IN ('', '.')
                            THEN INFO
                            ELSE ''
                        END ||
                        (
                        SELECT 
                            CASE 
                                WHEN table_variants.INFO NOT IN ('','.') 
                                    AND table_vcf.INFO NOT IN ('','.')  
                                THEN ';' 
                                ELSE '' 
                            END || 
                            CASE 
                                WHEN table_vcf.INFO NOT IN ('','.') 
                                THEN table_vcf.INFO 
                                ELSE '' 
                            END
                        FROM {table_vcf} as table_vcf
                        WHERE table_vcf.\"#CHROM\" = table_variants.\"#CHROM\"
                            AND table_vcf.\"POS\" = table_variants.\"POS\"
                            AND table_vcf.\"ALT\" = table_variants.\"ALT\"
                            AND table_vcf.\"REF\" = table_variants.\"REF\"
                        )
        """
        self.conn.execute(sql_query_update)

        # Drop temporary table
        sql_drop = f"DROP TABLE {table_vcf}"
        self.conn.execute(sql_drop)
 2709
 2710    def drop_variants_table(self) -> None:
 2711        """
 2712        > This function drops the variants table
 2713        """
 2714
 2715        table_variants = self.get_table_variants()
 2716        sql_table_variants = f"DROP TABLE IF EXISTS {table_variants}"
 2717        self.conn.execute(sql_table_variants)
 2718
 2719    def set_variant_id(
 2720        self, variant_id_column: str = "variant_id", force: bool = None
 2721    ) -> str:
 2722        """
 2723        It adds a column to the variants table called `variant_id` and populates it with a hash of the
 2724        `#CHROM`, `POS`, `REF`, and `ALT` columns
 2725
 2726        :param variant_id_column: The name of the column to be created in the variants table, defaults
 2727        to variant_id
 2728        :type variant_id_column: str (optional)
 2729        :param force: If True, the variant_id column will be created even if it already exists
 2730        :type force: bool
 2731        :return: The name of the column that contains the variant_id
 2732        """
 2733
 2734        # Assembly
 2735        assembly = self.get_param().get(
 2736            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
 2737        )
 2738
 2739        # INFO/Tag prefix
 2740        prefix = self.get_explode_infos_prefix()
 2741
 2742        # Explode INFO/SVTYPE
 2743        added_columns = self.explode_infos(prefix=prefix, fields=["SVTYPE"])
 2744
 2745        # variants table
 2746        table_variants = self.get_table_variants()
 2747
 2748        # variant_id column
 2749        if not variant_id_column:
 2750            variant_id_column = "variant_id"
 2751
 2752        # Creta variant_id column
 2753        if "variant_id" not in self.get_extra_infos() or force:
 2754
 2755            # Create column
 2756            self.add_column(
 2757                table_name=table_variants,
 2758                column_name=variant_id_column,
 2759                column_type="UBIGINT",
 2760                default_value="0",
 2761            )
 2762
 2763            # Update column
 2764            self.conn.execute(
 2765                f"""
 2766                    UPDATE {table_variants}
 2767                    SET "{variant_id_column}" = hash('{assembly}', "#CHROM", "POS", "REF", "ALT", '"{prefix}SVTYPE"')
 2768                """
 2769            )
 2770
 2771        # Remove added columns
 2772        for added_column in added_columns:
 2773            self.drop_column(column=added_column)
 2774
 2775        # return variant_id column name
 2776        return variant_id_column
 2777
 2778    def get_variant_id_column(
 2779        self, variant_id_column: str = "variant_id", force: bool = None
 2780    ) -> str:
 2781        """
 2782        This function returns the variant_id column name
 2783
 2784        :param variant_id_column: The name of the column in the dataframe that contains the variant IDs,
 2785        defaults to variant_id
 2786        :type variant_id_column: str (optional)
 2787        :param force: If True, will force the variant_id to be set to the value of variant_id_column. If
 2788        False, will only set the variant_id if it is not already set. If None, will set the variant_id
 2789        if it is not already set, or if it is set
 2790        :type force: bool
 2791        :return: The variant_id column name.
 2792        """
 2793
 2794        return self.set_variant_id(variant_id_column=variant_id_column, force=force)
 2795
 2796    ###
 2797    # Annotation
 2798    ###
 2799
 2800    def scan_databases(
 2801        self,
 2802        database_formats: list = ["parquet"],
 2803        database_releases: list = ["current"],
 2804    ) -> dict:
 2805        """
 2806        The function `scan_databases` scans for available databases based on specified formats and
 2807        releases.
 2808
 2809        :param database_formats: The `database_formats` parameter is a list that specifies the formats
 2810        of the databases to be scanned. In this case, the accepted format is "parquet"
 2811        :type database_formats: list ["parquet"]
 2812        :param database_releases: The `database_releases` parameter is a list that specifies the
 2813        releases of the databases to be scanned. In the provided function, the default value for
 2814        `database_releases` is set to `["current"]`, meaning that by default, the function will scan
 2815        databases that are in the "current"
 2816        :type database_releases: list
 2817        :return: The function `scan_databases` returns a dictionary containing information about
 2818        databases that match the specified formats and releases.
 2819        """
 2820
 2821        # Config
 2822        config = self.get_config()
 2823
 2824        # Param
 2825        param = self.get_param()
 2826
 2827        # Param - Assembly
 2828        assembly = param.get("assembly", config.get("assembly", None))
 2829        if not assembly:
 2830            assembly = DEFAULT_ASSEMBLY
 2831            log.warning(f"Default assembly '{assembly}'")
 2832
 2833        # Scan for availabled databases
 2834        log.info(
 2835            f"Annotations - Check annotation parameters - Scan existing databases - Assembly {[assembly]} - Formats {database_formats} - Releases {database_releases}..."
 2836        )
 2837        databases_infos_dict = databases_infos(
 2838            database_folder_releases=database_releases,
 2839            database_formats=database_formats,
 2840            assembly=assembly,
 2841            config=config,
 2842        )
 2843        log.info(
 2844            f"Annotations - Check annotation parameters - Scan existing databases - {len(databases_infos_dict)} databases found"
 2845        )
 2846
 2847        return databases_infos_dict
 2848
    def annotation(self) -> None:
        """
        Annotate the VCF file with the annotations specified in param/config.

        The method proceeds in three phases:

        1. Merge the "quick annotation" inputs - the comma-separated
           ``annotations`` string and the per-tool shortcut keys
           (``annotation_parquet``, ``annotation_snpsift``, ``annotation_snpeff``,
           ``annotation_bcftools``, ``annotation_annovar``,
           ``annotation_exomiser``, ``annotation_splice``) - into a single
           ``param["annotations"]`` comma-separated string.
        2. Resolve each entry into the structured ``param["annotation"]``
           section: tool-prefixed entries (``snpeff:``, ``annovar:``,
           ``exomiser``, ``splice``, ``bcftools:``, ``snpsift:``) are routed to
           their tool; the special ``ALL[:format=...][:release=...]`` keyword
           expands to every database found by `scan_databases`; remaining
           entries are searched on disk (as given, then inside the assembly
           sub-folder of each configured database folder) and dispatched to
           'bcftools' or 'parquet' according to the database file format.
        3. Run each configured tool's annotation method, then re-explode INFO
           fields into table columns if requested.

        Annotation files that cannot be found are logged as errors and skipped;
        files with an unsupported format raise ValueError.
        """

        # Config
        config = self.get_config()

        # Param
        param = self.get_param()

        # Param - Assembly (param overrides config, falls back to default)
        assembly = param.get("assembly", config.get("assembly", None))
        if not assembly:
            assembly = DEFAULT_ASSEMBLY
            log.warning(f"Default assembly '{assembly}'")

        # annotations databases folders
        # Union of the 'annotations', 'parquet' and 'bcftools' database folders
        # from config; used later to locate annotation files by name
        annotations_databases = set(
            config.get("folders", {})
            .get("databases", {})
            .get("annotations", [DEFAULT_ANNOTATIONS_FOLDER])
            + config.get("folders", {})
            .get("databases", {})
            .get("parquet", ["~/howard/databases/parquet/current"])
            + config.get("folders", {})
            .get("databases", {})
            .get("bcftools", ["~/howard/databases/bcftools/current"])
        )

        # Get param annotations
        if param.get("annotations", None) and isinstance(
            param.get("annotations", None), str
        ):
            log.debug(param.get("annotations", None))
            param_annotation_list = param.get("annotations").split(",")
        else:
            param_annotation_list = []

        # Each tools param
        # Per-tool shortcut parameters are normalized into the quick-annotation
        # string syntax ("tool:db1+db2"): lists are joined, commas become '+'
        if param.get("annotation_parquet", None) != None:
            log.debug(
                f"""param.get("annotation_parquet", None)={param.get("annotation_parquet", None)}"""
            )
            if isinstance(param.get("annotation_parquet", None), list):
                param_annotation_list.append(",".join(param.get("annotation_parquet")))
            else:
                param_annotation_list.append(param.get("annotation_parquet"))
        if param.get("annotation_snpsift", None) != None:
            if isinstance(param.get("annotation_snpsift", None), list):
                param_annotation_list.append(
                    "snpsift:"
                    + "+".join(param.get("annotation_snpsift")).replace(",", "+")
                )
            else:
                param_annotation_list.append(
                    "snpsift:" + param.get("annotation_snpsift").replace(",", "+")
                )
        if param.get("annotation_snpeff", None) != None:
            param_annotation_list.append("snpeff:" + param.get("annotation_snpeff"))
        if param.get("annotation_bcftools", None) != None:
            if isinstance(param.get("annotation_bcftools", None), list):
                param_annotation_list.append(
                    "bcftools:"
                    + "+".join(param.get("annotation_bcftools")).replace(",", "+")
                )
            else:
                param_annotation_list.append(
                    "bcftools:" + param.get("annotation_bcftools").replace(",", "+")
                )
        if param.get("annotation_annovar", None) != None:
            param_annotation_list.append("annovar:" + param.get("annotation_annovar"))
        if param.get("annotation_exomiser", None) != None:
            param_annotation_list.append("exomiser:" + param.get("annotation_exomiser"))
        if param.get("annotation_splice", None) != None:
            param_annotation_list.append("splice:" + param.get("annotation_splice"))

        # Merge param annotations list
        param["annotations"] = ",".join(param_annotation_list)

        # debug
        log.debug(f"param_annotations={param['annotations']}")

        if param.get("annotations"):

            # Log
            # log.info("Annotations - Check annotation parameters")

            if not "annotation" in param:
                param["annotation"] = {}

            # List of annotations parameters
            # Each entry maps an annotation source to its field selection;
            # {"INFO": None} means "take all INFO fields"
            annotations_list_input = {}
            if isinstance(param.get("annotations", None), str):
                annotation_file_list = [
                    value for value in param.get("annotations", "").split(",")
                ]
                for annotation_file in annotation_file_list:
                    annotations_list_input[annotation_file] = {"INFO": None}
            else:
                annotations_list_input = param.get("annotations", {})

            log.info(f"Quick Annotations:")
            for annotation_key in list(annotations_list_input.keys()):
                log.info(f"   {annotation_key}")

            # List of annotations and associated fields
            annotations_list = {}

            for annotation_file in annotations_list_input:

                # Explode annotations if ALL
                if (
                    annotation_file.upper() == "ALL"
                    or annotation_file.upper().startswith("ALL:")
                ):

                    # check ALL parameters (formats, releases)
                    # NOTE: defaults stay plain strings here; they become lists
                    # only when format=/release= options are provided -
                    # presumably scan_databases accepts both (TODO confirm)
                    annotation_file_split = annotation_file.split(":")
                    database_formats = "parquet"
                    database_releases = "current"
                    for annotation_file_option in annotation_file_split[1:]:
                        database_all_options_split = annotation_file_option.split("=")
                        if database_all_options_split[0] == "format":
                            database_formats = database_all_options_split[1].split("+")
                        if database_all_options_split[0] == "release":
                            database_releases = database_all_options_split[1].split("+")

                    # Scan for availabled databases
                    databases_infos_dict = self.scan_databases(
                        database_formats=database_formats,
                        database_releases=database_releases,
                    )

                    # Add found databases in annotation parameters
                    for database_infos in databases_infos_dict.keys():
                        annotations_list[database_infos] = {"INFO": None}

                else:
                    annotations_list[annotation_file] = annotations_list_input[
                        annotation_file
                    ]

            # Check each databases
            if len(annotations_list):

                log.info(
                    f"Annotations - Check annotation parameters - Check {len(annotations_list)} databases..."
                )

                for annotation_file in annotations_list:

                    # Init
                    annotations = annotations_list.get(annotation_file, None)

                    # Annotation snpEff
                    # Everything after "snpeff:" is treated as snpEff options
                    if annotation_file.startswith("snpeff"):

                        log.debug(f"Quick Annotation snpEff")

                        if "snpeff" not in param["annotation"]:
                            param["annotation"]["snpeff"] = {}

                        if "options" not in param["annotation"]["snpeff"]:
                            param["annotation"]["snpeff"]["options"] = ""

                        # snpEff options in annotations
                        param["annotation"]["snpeff"]["options"] = "".join(
                            annotation_file.split(":")[1:]
                        )

                    # Annotation Annovar
                    # Each ":"-separated token after "annovar" is an Annovar
                    # annotation name
                    elif annotation_file.startswith("annovar"):

                        log.debug(f"Quick Annotation Annovar")

                        if "annovar" not in param["annotation"]:
                            param["annotation"]["annovar"] = {}

                        if "annotations" not in param["annotation"]["annovar"]:
                            param["annotation"]["annovar"]["annotations"] = {}

                        # Options
                        annotation_file_split = annotation_file.split(":")
                        for annotation_file_annotation in annotation_file_split[1:]:
                            if annotation_file_annotation:
                                param["annotation"]["annovar"]["annotations"][
                                    annotation_file_annotation
                                ] = annotations

                    # Annotation Exomiser
                    elif annotation_file.startswith("exomiser"):

                        log.debug(f"Quick Annotation Exomiser")

                        param["annotation"]["exomiser"] = params_string_to_dict(
                            annotation_file
                        )

                    # Annotation Splice
                    elif annotation_file.startswith("splice"):

                        log.debug(f"Quick Annotation Splice")

                        param["annotation"]["splice"] = params_string_to_dict(
                            annotation_file
                        )

                    # Annotation Parquet or BCFTOOLS
                    else:

                        # Tools detection
                        # An explicit "bcftools:"/"snpsift:" prefix forces the
                        # tool; otherwise it is inferred from the file format
                        if annotation_file.startswith("bcftools:"):
                            annotation_tool_initial = "bcftools"
                            annotation_file = ":".join(annotation_file.split(":")[1:])
                        elif annotation_file.startswith("snpsift:"):
                            annotation_tool_initial = "snpsift"
                            annotation_file = ":".join(annotation_file.split(":")[1:])
                        else:
                            annotation_tool_initial = None

                        # list of files
                        # Both '+' and ':' act as file separators
                        annotation_file_list = annotation_file.replace("+", ":").split(
                            ":"
                        )

                        for annotation_file in annotation_file_list:

                            if annotation_file:

                                # Annotation tool initial
                                annotation_tool = annotation_tool_initial

                                # Find file
                                annotation_file_found = None

                                # Expand user
                                annotation_file = full_path(annotation_file)

                                if os.path.exists(annotation_file):
                                    annotation_file_found = annotation_file

                                else:
                                    # Find within assembly folders
                                    for annotations_database in annotations_databases:
                                        found_files = find_all(
                                            annotation_file,
                                            os.path.join(
                                                annotations_database, assembly
                                            ),
                                        )
                                        if len(found_files) > 0:
                                            annotation_file_found = found_files[0]
                                            break
                                    if not annotation_file_found and not assembly:
                                        # Find within folders
                                        for (
                                            annotations_database
                                        ) in annotations_databases:
                                            found_files = find_all(
                                                annotation_file, annotations_database
                                            )
                                            if len(found_files) > 0:
                                                annotation_file_found = found_files[0]
                                                break
                                log.debug(
                                    f"for {annotation_file} annotation_file_found={annotation_file_found}"
                                )

                                # Full path
                                annotation_file_found = full_path(annotation_file_found)

                                if annotation_file_found:

                                    database = Database(database=annotation_file_found)
                                    quick_annotation_format = database.get_format()
                                    quick_annotation_is_compressed = (
                                        database.is_compressed()
                                    )
                                    quick_annotation_is_indexed = os.path.exists(
                                        f"{annotation_file_found}.tbi"
                                    )
                                    # NOTE(review): hard-coded to False, so the
                                    # bcftools branch below is never taken when
                                    # the tool is auto-detected
                                    bcftools_preference = False

                                    # Check Annotation Tool
                                    if not annotation_tool:
                                        if (
                                            bcftools_preference
                                            and quick_annotation_format
                                            in ["vcf", "bed"]
                                            and quick_annotation_is_compressed
                                            and quick_annotation_is_indexed
                                        ):
                                            annotation_tool = "bcftools"
                                        # NOTE(review): "tsv" is listed twice -
                                        # harmless but redundant
                                        elif quick_annotation_format in [
                                            "vcf",
                                            "bed",
                                            "tsv",
                                            "tsv",
                                            "csv",
                                            "json",
                                            "tbl",
                                            "parquet",
                                            "duckdb",
                                        ]:
                                            annotation_tool = "parquet"
                                        else:
                                            log.error(
                                                f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet"
                                            )
                                            raise ValueError(
                                                f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet"
                                            )

                                    log.debug(
                                        f"Quick Annotation File {annotation_file} - Annotation tool: {annotation_tool}"
                                    )

                                    # Annotation Tool dispatch
                                    # Register the resolved file under
                                    # param["annotation"][tool]["annotations"]
                                    if annotation_tool:
                                        if annotation_tool not in param["annotation"]:
                                            param["annotation"][annotation_tool] = {}
                                        if (
                                            "annotations"
                                            not in param["annotation"][annotation_tool]
                                        ):
                                            param["annotation"][annotation_tool][
                                                "annotations"
                                            ] = {}
                                        param["annotation"][annotation_tool][
                                            "annotations"
                                        ][annotation_file_found] = annotations

                                else:
                                    log.error(
                                        f"Quick Annotation File {annotation_file} does NOT exist"
                                    )

                self.set_param(param)

        # Phase 3: run each configured annotation tool
        if param.get("annotation", None):
            log.info("Annotations")
            if param.get("annotation", {}).get("parquet", None):
                log.info("Annotations 'parquet'...")
                self.annotation_parquet()
            if param.get("annotation", {}).get("bcftools", None):
                log.info("Annotations 'bcftools'...")
                self.annotation_bcftools()
            if param.get("annotation", {}).get("snpsift", None):
                log.info("Annotations 'snpsift'...")
                self.annotation_snpsift()
            if param.get("annotation", {}).get("annovar", None):
                log.info("Annotations 'annovar'...")
                self.annotation_annovar()
            if param.get("annotation", {}).get("snpeff", None):
                log.info("Annotations 'snpeff'...")
                self.annotation_snpeff()
            if param.get("annotation", {}).get("exomiser", None) is not None:
                log.info("Annotations 'exomiser'...")
                self.annotation_exomiser()
            if param.get("annotation", {}).get("splice", None) is not None:
                log.info("Annotations 'splice' ...")
                self.annotation_splice()

        # Explode INFOS fields into table fields
        if self.get_explode_infos():
            self.explode_infos(
                prefix=self.get_explode_infos_prefix(),
                fields=self.get_explode_infos_fields(),
                force=True,
            )
 3220
 3221    def annotation_snpsift(self, threads: int = None) -> None:
 3222        """
 3223        This function annotate with bcftools
 3224
 3225        :param threads: Number of threads to use
 3226        :return: the value of the variable "return_value".
 3227        """
 3228
 3229        # DEBUG
 3230        log.debug("Start annotation with bcftools databases")
 3231
 3232        # Threads
 3233        if not threads:
 3234            threads = self.get_threads()
 3235        log.debug("Threads: " + str(threads))
 3236
 3237        # Config
 3238        config = self.get_config()
 3239        log.debug("Config: " + str(config))
 3240
 3241        # Config - snpSift
 3242        snpsift_bin_command = get_bin_command(
 3243            bin="SnpSift.jar",
 3244            tool="snpsift",
 3245            bin_type="jar",
 3246            config=config,
 3247            default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
 3248        )
 3249        if not snpsift_bin_command:
 3250            msg_err = f"Annotation failed: no snpsift bin '{snpsift_bin_command}'"
 3251            log.error(msg_err)
 3252            raise ValueError(msg_err)
 3253
 3254        # Config - bcftools
 3255        bcftools_bin_command = get_bin_command(
 3256            bin="bcftools",
 3257            tool="bcftools",
 3258            bin_type="bin",
 3259            config=config,
 3260            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
 3261        )
 3262        if not bcftools_bin_command:
 3263            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
 3264            log.error(msg_err)
 3265            raise ValueError(msg_err)
 3266
 3267        # Config - BCFTools databases folders
 3268        databases_folders = set(
 3269            self.get_config()
 3270            .get("folders", {})
 3271            .get("databases", {})
 3272            .get("annotations", ["."])
 3273            + self.get_config()
 3274            .get("folders", {})
 3275            .get("databases", {})
 3276            .get("bcftools", ["."])
 3277        )
 3278        log.debug("Databases annotations: " + str(databases_folders))
 3279
 3280        # Param
 3281        annotations = (
 3282            self.get_param()
 3283            .get("annotation", {})
 3284            .get("snpsift", {})
 3285            .get("annotations", None)
 3286        )
 3287        log.debug("Annotations: " + str(annotations))
 3288
 3289        # Assembly
 3290        assembly = self.get_param().get(
 3291            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
 3292        )
 3293
 3294        # Data
 3295        table_variants = self.get_table_variants()
 3296
 3297        # Check if not empty
 3298        log.debug("Check if not empty")
 3299        sql_query_chromosomes = (
 3300            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 3301        )
 3302        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
 3303        if not sql_query_chromosomes_df["count"][0]:
 3304            log.info(f"VCF empty")
 3305            return
 3306
 3307        # VCF header
 3308        vcf_reader = self.get_header()
 3309        log.debug("Initial header: " + str(vcf_reader.infos))
 3310
 3311        # Existing annotations
 3312        for vcf_annotation in self.get_header().infos:
 3313
 3314            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 3315            log.debug(
 3316                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 3317            )
 3318
 3319        if annotations:
 3320
 3321            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:
 3322
 3323                # Export VCF file
 3324                tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz")
 3325
 3326                # Init
 3327                commands = {}
 3328
 3329                for annotation in annotations:
 3330                    annotation_fields = annotations[annotation]
 3331
 3332                    # Annotation Name
 3333                    annotation_name = os.path.basename(annotation)
 3334
 3335                    if not annotation_fields:
 3336                        annotation_fields = {"INFO": None}
 3337
 3338                    log.debug(f"Annotation '{annotation_name}'")
 3339                    log.debug(
 3340                        f"Annotation '{annotation_name}' - fields: {annotation_fields}"
 3341                    )
 3342
 3343                    # Create Database
 3344                    database = Database(
 3345                        database=annotation,
 3346                        databases_folders=databases_folders,
 3347                        assembly=assembly,
 3348                    )
 3349
 3350                    # Find files
 3351                    db_file = database.get_database()
 3352                    db_file = full_path(db_file)
 3353                    db_hdr_file = database.get_header_file()
 3354                    db_hdr_file = full_path(db_hdr_file)
 3355                    db_file_type = database.get_format()
 3356                    db_tbi_file = f"{db_file}.tbi"
 3357                    db_file_compressed = database.is_compressed()
 3358
 3359                    # Check if compressed
 3360                    if not db_file_compressed:
 3361                        log.error(
 3362                            f"Annotation '{annotation}' - {db_file} NOT compressed file"
 3363                        )
 3364                        raise ValueError(
 3365                            f"Annotation '{annotation}' - {db_file} NOT compressed file"
 3366                        )
 3367
 3368                    # Check if indexed
 3369                    if not os.path.exists(db_tbi_file):
 3370                        log.error(
 3371                            f"Annotation '{annotation}' - {db_file} NOT indexed file"
 3372                        )
 3373                        raise ValueError(
 3374                            f"Annotation '{annotation}' - {db_file} NOT indexed file"
 3375                        )
 3376
 3377                    # Check index - try to create if not exists
 3378                    if not os.path.exists(db_file) or not os.path.exists(db_hdr_file):
 3379                        log.error("Annotation failed: database not valid")
 3380                        log.error(f"Annotation annotation file: {db_file}")
 3381                        log.error(f"Annotation annotation header: {db_hdr_file}")
 3382                        log.error(f"Annotation annotation index: {db_tbi_file}")
 3383                        raise ValueError(
 3384                            f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}"
 3385                        )
 3386                    else:
 3387
 3388                        log.debug(
 3389                            f"Annotation '{annotation}' - file: "
 3390                            + str(db_file)
 3391                            + " and "
 3392                            + str(db_hdr_file)
 3393                        )
 3394
 3395                        # Load header as VCF object
 3396                        db_hdr_vcf = Variants(input=db_hdr_file)
 3397                        db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
 3398                        log.debug(
 3399                            "Annotation database header: "
 3400                            + str(db_hdr_vcf_header_infos)
 3401                        )
 3402
 3403                        # For all fields in database
 3404                        annotation_fields_full = False
 3405                        if "ALL" in annotation_fields or "INFO" in annotation_fields:
 3406                            annotation_fields = {
 3407                                key: key for key in db_hdr_vcf_header_infos
 3408                            }
 3409                            log.debug(
 3410                                "Annotation database header - All annotations added: "
 3411                                + str(annotation_fields)
 3412                            )
 3413                            annotation_fields_full = True
 3414
 3415                        # # Create file for field rename
 3416                        # log.debug("Create file for field rename")
 3417                        # tmp_rename = NamedTemporaryFile(
 3418                        #     prefix=self.get_prefix(),
 3419                        #     dir=self.get_tmp_dir(),
 3420                        #     suffix=".rename",
 3421                        #     delete=False,
 3422                        # )
 3423                        # tmp_rename_name = tmp_rename.name
 3424                        # tmp_files.append(tmp_rename_name)
 3425
 3426                        # Number of fields
 3427                        nb_annotation_field = 0
 3428                        annotation_list = []
 3429                        annotation_infos_rename_list = []
 3430
 3431                        for annotation_field in annotation_fields:
 3432
 3433                            # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
 3434                            annotation_fields_new_name = annotation_fields.get(
 3435                                annotation_field, annotation_field
 3436                            )
 3437                            if not annotation_fields_new_name:
 3438                                annotation_fields_new_name = annotation_field
 3439
 3440                            # Check if field is in DB and if field is not elready in input data
 3441                            if (
 3442                                annotation_field in db_hdr_vcf.get_header().infos
 3443                                and annotation_fields_new_name
 3444                                not in self.get_header().infos
 3445                            ):
 3446
 3447                                log.info(
 3448                                    f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'"
 3449                                )
 3450
 3451                                # BCFTools annotate param to rename fields
 3452                                if annotation_field != annotation_fields_new_name:
 3453                                    annotation_infos_rename_list.append(
 3454                                        f"{annotation_fields_new_name}:=INFO/{annotation_field}"
 3455                                    )
 3456
 3457                                # Add INFO field to header
 3458                                db_hdr_vcf_header_infos_number = (
 3459                                    db_hdr_vcf_header_infos[annotation_field].num or "."
 3460                                )
 3461                                db_hdr_vcf_header_infos_type = (
 3462                                    db_hdr_vcf_header_infos[annotation_field].type
 3463                                    or "String"
 3464                                )
 3465                                db_hdr_vcf_header_infos_description = (
 3466                                    db_hdr_vcf_header_infos[annotation_field].desc
 3467                                    or f"{annotation_field} description"
 3468                                )
 3469                                db_hdr_vcf_header_infos_source = (
 3470                                    db_hdr_vcf_header_infos[annotation_field].source
 3471                                    or "unknown"
 3472                                )
 3473                                db_hdr_vcf_header_infos_version = (
 3474                                    db_hdr_vcf_header_infos[annotation_field].version
 3475                                    or "unknown"
 3476                                )
 3477
 3478                                vcf_reader.infos[annotation_fields_new_name] = (
 3479                                    vcf.parser._Info(
 3480                                        annotation_fields_new_name,
 3481                                        db_hdr_vcf_header_infos_number,
 3482                                        db_hdr_vcf_header_infos_type,
 3483                                        db_hdr_vcf_header_infos_description,
 3484                                        db_hdr_vcf_header_infos_source,
 3485                                        db_hdr_vcf_header_infos_version,
 3486                                        self.code_type_map[
 3487                                            db_hdr_vcf_header_infos_type
 3488                                        ],
 3489                                    )
 3490                                )
 3491
 3492                                annotation_list.append(annotation_field)
 3493
 3494                                nb_annotation_field += 1
 3495
 3496                            else:
 3497
 3498                                if (
 3499                                    annotation_field
 3500                                    not in db_hdr_vcf.get_header().infos
 3501                                ):
 3502                                    log.warning(
 3503                                        f"Annotation '{annotation_name}' - '{annotation_field}' - not available in vcf/bed file"
 3504                                    )
 3505                                if (
 3506                                    annotation_fields_new_name
 3507                                    in self.get_header().infos
 3508                                ):
 3509                                    log.warning(
 3510                                        f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' - already exists (skipped)"
 3511                                    )
 3512
 3513                        log.info(
 3514                            f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file"
 3515                        )
 3516
 3517                        annotation_infos = ",".join(annotation_list)
 3518
 3519                        if annotation_infos != "":
 3520
 3521                            # Annotated VCF (and error file)
 3522                            tmp_annotation_vcf_name = os.path.join(
 3523                                tmp_dir, os.path.basename(annotation) + ".vcf.gz"
 3524                            )
 3525                            tmp_annotation_vcf_name_err = (
 3526                                tmp_annotation_vcf_name + ".err"
 3527                            )
 3528
 3529                            # Add fields to annotate
 3530                            if not annotation_fields_full:
 3531                                annotation_infos_option = f"-info {annotation_infos}"
 3532                            else:
 3533                                annotation_infos_option = ""
 3534
 3535                            # Info fields rename
 3536                            if annotation_infos_rename_list:
 3537                                annotation_infos_rename = " -c " + ",".join(
 3538                                    annotation_infos_rename_list
 3539                                )
 3540                            else:
 3541                                annotation_infos_rename = ""
 3542
 3543                            # Annotate command
 3544                            command_annotate = f"{snpsift_bin_command} annotate {annotation_infos_option} {db_file} {tmp_vcf_name} | {bcftools_bin_command} annotate --threads={threads} {annotation_infos_rename} -Oz1 -o {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} "
 3545
 3546                            # Add command
 3547                            commands[command_annotate] = tmp_annotation_vcf_name
 3548
                if commands:

                    # Export current variants to the tmp VCF used as annotation input
                    # (INFO stripped, no samples, tabix-indexed).
                    self.export_variant_vcf(
                        vcf_file=tmp_vcf_name,
                        remove_info=True,
                        add_samples=False,
                        index=True,
                    )
                    # NOTE(review): debug leftover — unconditionally copies the input to a
                    # hard-coded /tmp path (ignores configured tmp dir, clobbered between
                    # runs, never cleaned up). Should be removed or gated on debug verbosity.
                    shutil.copyfile(tmp_vcf_name, "/tmp/input.vcf")

                    # Command counter, for progress logging only
                    nb_command = 0

                    # Run each annotation command sequentially; `commands` maps
                    # command string -> path of the annotated VCF it produces.
                    for command_annotate in commands:
                        nb_command += 1
                        log.info(
                            f"Annotation - Annotate [{nb_command}/{len(commands)}]..."
                        )
                        log.debug(f"command_annotate={command_annotate}")
                        run_parallel_commands([command_annotate], threads)

                        # NOTE(review): debug leftover — same hard-coded /tmp issue as above;
                        # also overwritten on every loop iteration, so only the last
                        # command's output survives. Remove or gate on debug verbosity.
                        shutil.copyfile(commands[command_annotate], "/tmp/snpsift.vcf")

                        # Merge the annotated VCF's INFO back into the variants table
                        log.info(
                            f"Annotation - Updating [{nb_command}/{len(commands)}]..."
                        )
                        self.update_from_vcf(commands[command_annotate])
 3580
    def annotation_bcftools(self, threads: int = None) -> None:
        """
        Annotate the variants table with 'bcftools annotate', using the databases
        configured in param 'annotation' > 'bcftools' > 'annotations'.

        For each configured database (VCF or BED) the database header file is
        loaded, the requested INFO fields are registered in this object's header
        (optionally renamed via the bcftools ':=INFO/<src>' column syntax), and one
        'bcftools annotate' command is generated per chromosome, restricted to a
        BED of merged +/-1Mb windows around the variants. The per-chromosome
        outputs are then merged back with 'bcftools merge'.

        :param threads: number of threads to use; falls back to self.get_threads()
            when None/0
        :return: None (returns early if the variants table is empty or no
            annotations are configured)
        :raises ValueError: if the bcftools binary cannot be resolved, or a
            database file is missing, not compressed, or not tabix-indexed
        """

        # DEBUG
        log.debug("Start annotation with bcftools databases")

        # Threads (None/0 falls back to the configured default)
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # Keep tmp files/folders when verbosity is debug.
        # NOTE(review): delete_tmp is computed here but not consumed in the visible
        # part of this method — confirm it is used further down.
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
            log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config - BCFTools bin command (fail fast if not resolvable)
        bcftools_bin_command = get_bin_command(
            bin="bcftools",
            tool="bcftools",
            bin_type="bin",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
        )
        if not bcftools_bin_command:
            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - databases folders: union of generic 'annotations' folders and
        # bcftools-specific folders (both default to the current directory)
        databases_folders = set(
            self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("annotations", ["."])
            + self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("bcftools", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Param - mapping of database -> requested fields (None means all)
        annotations = (
            self.get_param()
            .get("annotation", {})
            .get("bcftools", {})
            .get("annotations", None)
        )
        log.debug("Annotations: " + str(annotations))

        # Assembly (param overrides config, falls back to default)
        assembly = self.get_param().get(
            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
        )

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty - nothing to annotate on an empty table
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
        if not sql_query_chromosomes_df["count"][0]:
            log.info(f"VCF empty")
            return

        # Tmp VCF that will hold the exported variants to annotate
        # (delete=False: the file outlives this handle; it is passed to shell commands)
        log.debug("Create initial file to annotate")
        tmp_vcf = NamedTemporaryFile(
            prefix=self.get_prefix(),
            dir=self.get_tmp_dir(),
            suffix=".vcf.gz",
            delete=False,
        )
        tmp_vcf_name = tmp_vcf.name

        # VCF header object; new INFO fields are registered on it below
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Log annotations already present in the input VCF
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        if annotations:

            tmp_ann_vcf_list = []
            commands = []
            tmp_files = []
            err_files = []

            for annotation in annotations:
                annotation_fields = annotations[annotation]

                # Annotation Name
                annotation_name = os.path.basename(annotation)

                # No explicit fields means "all INFO fields"
                if not annotation_fields:
                    annotation_fields = {"INFO": None}

                log.debug(f"Annotation '{annotation_name}'")
                log.debug(
                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
                )

                # Create Database object to resolve files for this annotation
                database = Database(
                    database=annotation,
                    databases_folders=databases_folders,
                    assembly=assembly,
                )

                # Find files: data file, header file, format, tabix index
                db_file = database.get_database()
                db_file = full_path(db_file)
                db_hdr_file = database.get_header_file()
                db_hdr_file = full_path(db_hdr_file)
                db_file_type = database.get_format()
                db_tbi_file = f"{db_file}.tbi"
                db_file_compressed = database.is_compressed()

                # Check if compressed (bcftools annotate requires bgzip input)
                if not db_file_compressed:
                    log.error(
                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
                    )
                    raise ValueError(
                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
                    )

                # Check if tabix-indexed (.tbi must exist next to the data file)
                if not os.path.exists(db_tbi_file):
                    log.error(f"Annotation '{annotation}' - {db_file} NOT indexed file")
                    raise ValueError(
                        f"Annotation '{annotation}' - {db_file} NOT indexed file"
                    )

                # Check data and header files exist (nothing is created here,
                # despite what an older comment claimed)
                if not os.path.exists(db_file) or not os.path.exists(db_hdr_file):
                    log.error("Annotation failed: database not valid")
                    log.error(f"Annotation annotation file: {db_file}")
                    log.error(f"Annotation annotation header: {db_hdr_file}")
                    log.error(f"Annotation annotation index: {db_tbi_file}")
                    raise ValueError(
                        f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}"
                    )
                else:

                    log.debug(
                        f"Annotation '{annotation}' - file: "
                        + str(db_file)
                        + " and "
                        + str(db_hdr_file)
                    )

                    # Load the database header as a Variants object to read its INFO fields
                    db_hdr_vcf = Variants(input=db_hdr_file)
                    db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
                    log.debug(
                        "Annotation database header: " + str(db_hdr_vcf_header_infos)
                    )

                    # 'ALL'/'INFO' sentinel expands to every field of the database header
                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
                        annotation_fields = {
                            key: key for key in db_hdr_vcf_header_infos
                        }
                        log.debug(
                            "Annotation database header - All annotations added: "
                            + str(annotation_fields)
                        )

                    # Number of fields retained, and bcftools '-c' column specs
                    nb_annotation_field = 0
                    annotation_list = []

                    for annotation_field in annotation_fields:

                        # Target field name: the mapping value, falling back to the
                        # source field name when empty/None
                        annotation_fields_new_name = annotation_fields.get(
                            annotation_field, annotation_field
                        )
                        if not annotation_fields_new_name:
                            annotation_fields_new_name = annotation_field

                        # Check if field is in DB and if field is not already in input data
                        if (
                            annotation_field in db_hdr_vcf.get_header().infos
                            and annotation_fields_new_name
                            not in self.get_header().infos
                        ):

                            log.info(
                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'"
                            )

                            # Add INFO field to header (defaults when the database
                            # header leaves attributes unset)
                            db_hdr_vcf_header_infos_number = (
                                db_hdr_vcf_header_infos[annotation_field].num or "."
                            )
                            db_hdr_vcf_header_infos_type = (
                                db_hdr_vcf_header_infos[annotation_field].type
                                or "String"
                            )
                            db_hdr_vcf_header_infos_description = (
                                db_hdr_vcf_header_infos[annotation_field].desc
                                or f"{annotation_field} description"
                            )
                            db_hdr_vcf_header_infos_source = (
                                db_hdr_vcf_header_infos[annotation_field].source
                                or "unknown"
                            )
                            db_hdr_vcf_header_infos_version = (
                                db_hdr_vcf_header_infos[annotation_field].version
                                or "unknown"
                            )

                            vcf_reader.infos[annotation_fields_new_name] = (
                                vcf.parser._Info(
                                    annotation_fields_new_name,
                                    db_hdr_vcf_header_infos_number,
                                    db_hdr_vcf_header_infos_type,
                                    db_hdr_vcf_header_infos_description,
                                    db_hdr_vcf_header_infos_source,
                                    db_hdr_vcf_header_infos_version,
                                    self.code_type_map[db_hdr_vcf_header_infos_type],
                                )
                            )

                            # bcftools '-c' column spec; 'NEW:=INFO/OLD' renames on the fly
                            if annotation_field != annotation_fields_new_name:
                                annotation_list.append(
                                    f"{annotation_fields_new_name}:=INFO/{annotation_field}"
                                )
                            else:
                                annotation_list.append(annotation_field)

                            nb_annotation_field += 1

                        else:

                            if annotation_field not in db_hdr_vcf.get_header().infos:
                                log.warning(
                                    f"Annotation '{annotation}' - '{annotation_field}' - not available in vcf/bed file"
                                )
                            if annotation_fields_new_name in self.get_header().infos:
                                log.warning(
                                    f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
                                )

                    log.info(
                        f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file"
                    )

                    annotation_infos = ",".join(annotation_list)

                    if annotation_infos != "":

                        # Protect header for bcftools: keep only '##' meta lines
                        # (drop the '#CHROM' line and any variant lines)
                        log.debug("Protect Header file - remove #CHROM line if exists")
                        tmp_header_vcf = NamedTemporaryFile(
                            prefix=self.get_prefix(),
                            dir=self.get_tmp_dir(),
                            suffix=".hdr",
                            delete=False,
                        )
                        tmp_header_vcf_name = tmp_header_vcf.name
                        tmp_files.append(tmp_header_vcf_name)
                        # Command (zcat for gzipped headers, cat otherwise)
                        if db_hdr_file.endswith(".gz"):
                            command_extract_header = f"zcat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
                        else:
                            command_extract_header = f"cat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
                        # Run
                        run_parallel_commands([command_extract_header], 1)

                        # Find chromosomes present in the variants table
                        log.debug("Find chromosomes ")
                        sql_query_chromosomes = f"""SELECT table_variants.\"#CHROM\" as CHROM FROM {table_variants} as table_variants GROUP BY table_variants.\"#CHROM\""""
                        sql_query_chromosomes_df = self.get_query_to_df(
                            sql_query_chromosomes
                        )
                        chomosomes_list = list(sql_query_chromosomes_df["CHROM"])

                        log.debug("Chromosomes found: " + str(list(chomosomes_list)))

                        # BED databases: prepend positional columns to the '-c' spec
                        if db_file_type in ["bed"]:
                            annotation_infos = "CHROM,POS,POS," + annotation_infos

                        for chrom in chomosomes_list:

                            # Create BED on initial VCF (regions restricting bcftools)
                            log.debug("Create BED on initial VCF: " + str(tmp_vcf_name))
                            tmp_bed = NamedTemporaryFile(
                                prefix=self.get_prefix(),
                                dir=self.get_tmp_dir(),
                                suffix=".bed",
                                delete=False,
                            )
                            tmp_bed_name = tmp_bed.name
                            tmp_files.append(tmp_bed_name)

                            # Detect regions: +/-1Mb windows around each variant,
                            # clamped at 0, then merged into non-overlapping intervals
                            log.debug(
                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Start detecting regions..."
                            )
                            window = 1000000
                            sql_query_intervals_for_bed = f"""
                                SELECT  \"#CHROM\",
                                        CASE WHEN \"POS\"-{window}-1 < 0 THEN 0 ELSE \"POS\"-{window}-1 END,
                                        \"POS\"+{window}
                                FROM {table_variants} as table_variants
                                WHERE table_variants.\"#CHROM\" = '{chrom}'
                            """
                            regions = self.conn.execute(
                                sql_query_intervals_for_bed
                            ).fetchall()
                            merged_regions = merge_regions(regions)
                            log.debug(
                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Stop detecting regions..."
                            )

                            header = ["#CHROM", "START", "END"]
                            with open(tmp_bed_name, "w") as f:
                                # Write the header with tab delimiter
                                f.write("\t".join(header) + "\n")
                                for d in merged_regions:
                                    # Write each data row with tab delimiter
                                    f.write("\t".join(map(str, d)) + "\n")

                            # Tmp output VCF (+ its stderr capture file)
                            tmp_annotation_vcf = NamedTemporaryFile(
                                prefix=self.get_prefix(),
                                dir=self.get_tmp_dir(),
                                suffix=".vcf.gz",
                                delete=False,
                            )
                            tmp_annotation_vcf_name = tmp_annotation_vcf.name
                            tmp_files.append(tmp_annotation_vcf_name)
                            tmp_ann_vcf_list.append(f"{tmp_annotation_vcf_name}")
                            tmp_annotation_vcf_name_err = (
                                tmp_annotation_vcf_name + ".err"
                            )
                            err_files.append(tmp_annotation_vcf_name_err)

                            # Annotate Command
                            log.debug(
                                f"Annotation '{annotation}' - add bcftools command"
                            )

                            # One annotate+tabix command per chromosome;
                            # -Oz1 = bgzf output at compression level 1
                            command_annotate = f"{bcftools_bin_command} annotate --pair-logic exact --regions-file={tmp_bed_name} -a {db_file} -h {tmp_header_vcf_name} -c {annotation_infos} {tmp_vcf_name} -o {tmp_annotation_vcf_name} -Oz1 2>>{tmp_annotation_vcf_name_err} && tabix {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} "

                            # Add command
                            commands.append(command_annotate)

            # if some commands
            if commands:

                # Export current variants into the tmp VCF used as annotation input
                # (INFO stripped, no samples, tabix-indexed)
                self.export_variant_vcf(
                    vcf_file=tmp_vcf_name,
                    remove_info=True,
                    add_samples=False,
                    index=True,
                )

                # Split the thread budget evenly across the annotate commands,
                # which are run in parallel below; minimum one thread each
                if commands:
                    threads_bcftools_annotate = round(threads / len(commands))
                else:
                    threads_bcftools_annotate = 1

                if not threads_bcftools_annotate:
                    threads_bcftools_annotate = 1

                # Add threads option to bcftools commands (textual rewrite of each command)
                if threads_bcftools_annotate > 1:
                    commands_threaded = []
                    for command in commands:
                        commands_threaded.append(
                            command.replace(
                                f"{bcftools_bin_command} annotate ",
                                f"{bcftools_bin_command} annotate --threads={threads_bcftools_annotate} ",
                            )
                        )
                    commands = commands_threaded

                # Command annotation multithreading
                log.debug(f"Annotation - Annotation commands: " + str(commands))
                log.info(
                    f"Annotation - Annotation multithreaded in "
                    + str(len(commands))
                    + " commands"
                )

                run_parallel_commands(commands, threads)

                # Merge the per-chromosome annotated VCFs back together
                tmp_ann_vcf_list_cmd = " ".join(tmp_ann_vcf_list)

                if tmp_ann_vcf_list_cmd:

                    # Tmp merged file
                    # NOTE(review): delete=True closes/removes this file when the handle is
                    # garbage-collected, yet its name is handed to an external command —
                    # confirm the merge output is consumed before the handle dies.
                    tmp_annotate_vcf = NamedTemporaryFile(
                        prefix=self.get_prefix(),
                        dir=self.get_tmp_dir(),
                        suffix=".vcf.gz",
                        delete=True,
                    )
                    tmp_annotate_vcf_name = tmp_annotate_vcf.name
                    tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
                    err_files.append(tmp_annotate_vcf_name_err)

                    # Tmp file remove command, appended to the merge command so
                    # intermediates are cleaned up only on successful merge
                    tmp_files_remove_command = ""
                    if tmp_files:
                        tmp_files_remove_command = " && rm -f " + " ".join(tmp_files)

                    # Command merge
                    merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_ann_vcf_list_cmd} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} {tmp_files_remove_command}"
                    log.info(
                        f"Annotation - Annotation merging "
                        + str(len(commands))
                        + " annotated files"
                    )
                    log.debug(f"Annotation - merge command: {merge_command}")
                    run_parallel_commands([merge_command], 1)

                    # Collect stderr output from all commands, classifying bcftools
                    # '[W::...]' lines as warnings
                    log.info(f"Error/Warning messages:")
                    error_message_command_all = []
                    error_message_command_warning = []
                    error_message_command_err = []
                    for err_file in err_files:
                        with open(err_file, "r") as f:
                            for line in f:
                                message = line.strip()
                                error_message_command_all.append(message)
                                if line.startswith("[W::"):
                                    error_message_command_warning.append(message)
 4041                                if line.startswith("[E::"):
 4042                                    error_message_command_err.append(
 4043                                        f"{err_file}: " + message
 4044                                    )
 4045                    # log info
 4046                    for message in list(
 4047                        set(error_message_command_err + error_message_command_warning)
 4048                    ):
 4049                        log.info(f"   {message}")
 4050                    # debug info
 4051                    for message in list(set(error_message_command_all)):
 4052                        log.debug(f"   {message}")
 4053                    # failed
 4054                    if len(error_message_command_err):
 4055                        log.error("Annotation failed: Error in commands")
 4056                        raise ValueError("Annotation failed: Error in commands")
 4057
 4058                    # Update variants
 4059                    log.info(f"Annotation - Updating...")
 4060                    self.update_from_vcf(tmp_annotate_vcf_name)
 4061
 4062    def annotation_exomiser(self, threads: int = None) -> None:
 4063        """
 4064        This function annotate with Exomiser
 4065
 4066        This function uses args as parameters, in section "annotation" -> "exomiser", with sections:
 4067        - "analysis" (dict/file):
 4068            Full analysis dictionary parameters (see Exomiser docs).
 4069            Either a dict, or a file in JSON or YAML format.
 4070            These parameters may change depending on other parameters (e.g. phenotypicFeatures/HPO)
 4071            Default : None
 4072        - "preset" (string):
 4073            Analysis preset (available in config folder).
 4074            Used if no full "analysis" is provided.
 4075            Default: "exome"
 4076        - "phenopacket" (dict/file):
 4077            Samples and phenotypic features parameters (see Exomiser docs).
 4078            Either a dict, or a file in JSON or YAML format.
 4079            Default: None
 4080        - "subject" (dict):
 4081            Sample parameters (see Exomiser docs).
 4082            Example:
 4083                "subject":
 4084                    {
 4085                        "id": "ISDBM322017",
 4086                        "sex": "FEMALE"
 4087                    }
 4088            Default: None
 4089        - "sample" (string):
 4090            Sample name to construct "subject" section:
 4091                "subject":
 4092                    {
 4093                        "id": "<sample>",
 4094                        "sex": "UNKNOWN_SEX"
 4095                    }
 4096            Default: None
 4097        - "phenotypicFeatures" (dict)
 4098            Phenotypic features to construct "subject" section.
 4099            Example:
 4100                "phenotypicFeatures":
 4101                    [
 4102                        { "type": { "id": "HP:0001159", "label": "Syndactyly" } },
 4103                        { "type": { "id": "HP:0000486", "label": "Strabismus" } }
 4104                    ]
 4105        - "hpo" (list)
 4106            List of HPO ids as phenotypic features.
 4107            Example:
 4108                "hpo": ['0001156', '0001363', '0011304', '0010055']
 4109            Default: []
 4110        - "outputOptions" (dict):
 4111            Output options (see Exomiser docs).
 4112            Default:
 4113                "output_options" =
 4114                    {
 4115                        "outputContributingVariantsOnly": False,
 4116                        "numGenes": 0,
 4117                        "outputFormats": ["TSV_VARIANT", "VCF"]
 4118                    }
 4119        - "transcript_source" (string):
 4120            Transcript source (either "refseq", "ucsc", "ensembl")
 4121            Default: "refseq"
 4122        - "exomiser_to_info" (boolean):
 4123            Add exomiser TSV file columns as INFO fields in VCF.
 4124            Default: False
 4125        - "release" (string):
 4126            Exomiser database release.
 4127            If not exists, database release will be downloaded (take a while).
 4128            Default: None (provided by application.properties configuration file)
 4129        - "exomiser_application_properties" (file):
 4130            Exomiser configuration file (see Exomiser docs).
 4131            Useful to automatically download databases (especially for specific genome databases).
 4132
 4133        Notes:
 4134        - If no sample in parameters, first sample in VCF will be chosen
 4135        - If no HPO found, "hiPhivePrioritiser" analysis step will be switched off
 4136
 4137        :param threads: The number of threads to use
 4138        :return: None.
 4139        """
 4140
 4141        # DEBUG
 4142        log.debug("Start annotation with Exomiser databases")
 4143
 4144        # Threads
 4145        if not threads:
 4146            threads = self.get_threads()
 4147        log.debug("Threads: " + str(threads))
 4148
 4149        # Config
 4150        config = self.get_config()
 4151        log.debug("Config: " + str(config))
 4152
 4153        # Config - Folders - Databases
 4154        databases_folders = (
 4155            config.get("folders", {})
 4156            .get("databases", {})
 4157            .get("exomiser", f"{DEFAULT_DATABASE_FOLDER}/exomiser/current")
 4158        )
 4159        databases_folders = full_path(databases_folders)
 4160        if not os.path.exists(databases_folders):
 4161            log.error(f"Databases annotations: {databases_folders} NOT found")
 4162        log.debug("Databases annotations: " + str(databases_folders))
 4163
 4164        # Config - Exomiser
 4165        exomiser_bin_command = get_bin_command(
 4166            bin="exomiser-cli*.jar",
 4167            tool="exomiser",
 4168            bin_type="jar",
 4169            config=config,
 4170            default_folder=f"{DEFAULT_TOOLS_FOLDER}/exomiser",
 4171        )
 4172        log.debug("Exomiser bin command: " + str(exomiser_bin_command))
 4173        if not exomiser_bin_command:
 4174            msg_err = f"Annotation failed: no exomiser bin '{exomiser_bin_command}'"
 4175            log.error(msg_err)
 4176            raise ValueError(msg_err)
 4177
 4178        # Param
 4179        param = self.get_param()
 4180        log.debug("Param: " + str(param))
 4181
 4182        # Param - Exomiser
 4183        param_exomiser = param.get("annotation", {}).get("exomiser", {})
 4184        log.debug(f"Param Exomiser: {param_exomiser}")
 4185
 4186        # Param - Assembly
 4187        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
 4188        log.debug("Assembly: " + str(assembly))
 4189
 4190        # Data
 4191        table_variants = self.get_table_variants()
 4192
 4193        # Check if not empty
 4194        log.debug("Check if not empty")
 4195        sql_query_chromosomes = (
 4196            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 4197        )
 4198        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
 4199            log.info(f"VCF empty")
 4200            return False
 4201
 4202        # VCF header
 4203        vcf_reader = self.get_header()
 4204        log.debug("Initial header: " + str(vcf_reader.infos))
 4205
 4206        # Samples
 4207        samples = self.get_header_sample_list()
 4208        if not samples:
 4209            log.error("No Samples in VCF")
 4210            return False
 4211        log.debug(f"Samples: {samples}")
 4212
 4213        # Memory limit
 4214        memory_limit = self.get_memory("8G")
 4215        log.debug(f"memory_limit: {memory_limit}")
 4216
 4217        # Exomiser java options
 4218        exomiser_java_options = (
 4219            f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} "
 4220        )
 4221        log.debug(f"Exomiser java options: {exomiser_java_options}")
 4222
 4223        # Download Exomiser (if not exists)
 4224        exomiser_release = param_exomiser.get("release", None)
 4225        exomiser_application_properties = param_exomiser.get(
 4226            "exomiser_application_properties", None
 4227        )
 4228        databases_download_exomiser(
 4229            assemblies=[assembly],
 4230            exomiser_folder=databases_folders,
 4231            exomiser_release=exomiser_release,
 4232            exomiser_phenotype_release=exomiser_release,
 4233            exomiser_application_properties=exomiser_application_properties,
 4234        )
 4235
 4236        # Force annotation
 4237        force_update_annotation = True
 4238
 4239        if "Exomiser" not in self.get_header().infos or force_update_annotation:
 4240            log.debug("Start annotation Exomiser")
 4241
 4242            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:
 4243
 4244                # tmp_dir = "/tmp/exomiser"
 4245
 4246                ### ANALYSIS ###
 4247                ################
 4248
 4249                # Create analysis.json through analysis dict
 4250                # either analysis in param or by default
 4251                # (depending on preset exome/genome)
 4252
 4253                # Init analysis dict
 4254                param_exomiser_analysis_dict = {}
 4255
 4256                # analysis from param
 4257                param_exomiser_analysis = param_exomiser.get("analysis", {})
 4258                param_exomiser_analysis = full_path(param_exomiser_analysis)
 4259
 4260                # If analysis in param -> load analysis json
 4261                if param_exomiser_analysis:
 4262
 4263                    # If param analysis is a file and exists
 4264                    if isinstance(param_exomiser_analysis, str) and os.path.exists(
 4265                        param_exomiser_analysis
 4266                    ):
 4267                        # Load analysis file into analysis dict (either yaml or json)
 4268                        with open(param_exomiser_analysis) as json_file:
 4269                            param_exomiser_analysis_dict = yaml.safe_load(json_file)
 4270
 4271                    # If param analysis is a dict
 4272                    elif isinstance(param_exomiser_analysis, dict):
 4273                        # Load analysis dict into analysis dict (either yaml or json)
 4274                        param_exomiser_analysis_dict = param_exomiser_analysis
 4275
 4276                    # Error analysis type
 4277                    else:
 4278                        log.error(f"Analysis type unknown. Check param file.")
 4279                        raise ValueError(f"Analysis type unknown. Check param file.")
 4280
 4281                # Case no input analysis config file/dict
 4282                # Use preset (exome/genome) to open default config file
 4283                if not param_exomiser_analysis_dict:
 4284
 4285                    # default preset
 4286                    default_preset = "exome"
 4287
 4288                    # Get param preset or default preset
 4289                    param_exomiser_preset = param_exomiser.get("preset", default_preset)
 4290
 4291                    # Try to find if preset is a file
 4292                    if os.path.exists(param_exomiser_preset):
 4293                        # Preset file is provided in full path
 4294                        param_exomiser_analysis_default_config_file = (
 4295                            param_exomiser_preset
 4296                        )
 4297                    # elif os.path.exists(full_path(param_exomiser_preset)):
 4298                    #     # Preset file is provided in full path
 4299                    #     param_exomiser_analysis_default_config_file = full_path(param_exomiser_preset)
 4300                    elif os.path.exists(
 4301                        os.path.join(folder_config, param_exomiser_preset)
 4302                    ):
 4303                        # Preset file is provided as a basename in config folder (can be a path with subfolders)
 4304                        param_exomiser_analysis_default_config_file = os.path.join(
 4305                            folder_config, param_exomiser_preset
 4306                        )
 4307                    else:
 4308                        # Construct preset file
 4309                        param_exomiser_analysis_default_config_file = os.path.join(
 4310                            folder_config,
 4311                            f"preset-{param_exomiser_preset}-analysis.json",
 4312                        )
 4313
 4314                    # If preset file exists
 4315                    param_exomiser_analysis_default_config_file = full_path(
 4316                        param_exomiser_analysis_default_config_file
 4317                    )
 4318                    if os.path.exists(param_exomiser_analysis_default_config_file):
 4319                        # Load preset file into analysis dict (either yaml or json)
 4320                        with open(
 4321                            param_exomiser_analysis_default_config_file
 4322                        ) as json_file:
 4323                            # param_exomiser_analysis_dict[""] = json.load(json_file)
 4324                            param_exomiser_analysis_dict["analysis"] = yaml.safe_load(
 4325                                json_file
 4326                            )
 4327
 4328                    # Error preset file
 4329                    else:
 4330                        log.error(
 4331                            f"No analysis preset config file ({param_exomiser_analysis_default_config_file})"
 4332                        )
 4333                        raise ValueError(
 4334                            f"No analysis preset config file ({param_exomiser_analysis_default_config_file})"
 4335                        )
 4336
 4337                # If no analysis dict created
 4338                if not param_exomiser_analysis_dict:
 4339                    log.error(f"No analysis config")
 4340                    raise ValueError(f"No analysis config")
 4341
 4342                # Log
 4343                log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}")
 4344
 4345                ### PHENOPACKET ###
 4346                ###################
 4347
 4348                # If no PhenoPacket in analysis dict -> check in param
 4349                if "phenopacket" not in param_exomiser_analysis_dict:
 4350
 4351                    # If PhenoPacket in param -> load phenopacket json
 4352                    if param_exomiser.get("phenopacket", None):
 4353
 4354                        param_exomiser_phenopacket = param_exomiser.get("phenopacket")
 4355                        param_exomiser_phenopacket = full_path(
 4356                            param_exomiser_phenopacket
 4357                        )
 4358
 4359                        # If param phenopacket is a file and exists
 4360                        if isinstance(
 4361                            param_exomiser_phenopacket, str
 4362                        ) and os.path.exists(param_exomiser_phenopacket):
 4363                            # Load phenopacket file into analysis dict (either yaml or json)
 4364                            with open(param_exomiser_phenopacket) as json_file:
 4365                                param_exomiser_analysis_dict["phenopacket"] = (
 4366                                    yaml.safe_load(json_file)
 4367                                )
 4368
 4369                        # If param phenopacket is a dict
 4370                        elif isinstance(param_exomiser_phenopacket, dict):
 4371                            # Load phenopacket dict into analysis dict (either yaml or json)
 4372                            param_exomiser_analysis_dict["phenopacket"] = (
 4373                                param_exomiser_phenopacket
 4374                            )
 4375
 4376                        # Error phenopacket type
 4377                        else:
 4378                            log.error(f"Phenopacket type unknown. Check param file.")
 4379                            raise ValueError(
 4380                                f"Phenopacket type unknown. Check param file."
 4381                            )
 4382
 4383                # If no PhenoPacket in analysis dict -> construct from sample and HPO in param
 4384                if "phenopacket" not in param_exomiser_analysis_dict:
 4385
 4386                    # Init PhenoPacket
 4387                    param_exomiser_analysis_dict["phenopacket"] = {
 4388                        "id": "analysis",
 4389                        "proband": {},
 4390                    }
 4391
 4392                    ### Add subject ###
 4393
 4394                    # If subject exists
 4395                    param_exomiser_subject = param_exomiser.get("subject", {})
 4396
 4397                    # If subject not exists -> found sample ID
 4398                    if not param_exomiser_subject:
 4399
 4400                        # Found sample ID in param
 4401                        sample = param_exomiser.get("sample", None)
 4402
 4403                        # Find sample ID (first sample)
 4404                        if not sample:
 4405                            sample_list = self.get_header_sample_list()
 4406                            if len(sample_list) > 0:
 4407                                sample = sample_list[0]
 4408                            else:
 4409                                log.error(f"No sample found")
 4410                                raise ValueError(f"No sample found")
 4411
 4412                        # Create subject
 4413                        param_exomiser_subject = {"id": sample, "sex": "UNKNOWN_SEX"}
 4414
 4415                    # Add to dict
 4416                    param_exomiser_analysis_dict["phenopacket"][
 4417                        "subject"
 4418                    ] = param_exomiser_subject
 4419
 4420                    ### Add "phenotypicFeatures" ###
 4421
 4422                    # If phenotypicFeatures exists
 4423                    param_exomiser_phenotypicfeatures = param_exomiser.get(
 4424                        "phenotypicFeatures", []
 4425                    )
 4426
 4427                    # If phenotypicFeatures not exists -> Try to infer from hpo list
 4428                    if not param_exomiser_phenotypicfeatures:
 4429
 4430                        # Found HPO in param
 4431                        param_exomiser_hpo = param_exomiser.get("hpo", [])
 4432
 4433                        # Split HPO if list in string format separated by comma
 4434                        if isinstance(param_exomiser_hpo, str):
 4435                            param_exomiser_hpo = param_exomiser_hpo.split(",")
 4436
 4437                        # Create HPO list
 4438                        for hpo in param_exomiser_hpo:
 4439                            hpo_clean = re.sub("[^0-9]", "", hpo)
 4440                            param_exomiser_phenotypicfeatures.append(
 4441                                {
 4442                                    "type": {
 4443                                        "id": f"HP:{hpo_clean}",
 4444                                        "label": f"HP:{hpo_clean}",
 4445                                    }
 4446                                }
 4447                            )
 4448
 4449                    # Add to dict
 4450                    param_exomiser_analysis_dict["phenopacket"][
 4451                        "phenotypicFeatures"
 4452                    ] = param_exomiser_phenotypicfeatures
 4453
 4454                    # If phenotypicFeatures not exists -> Remove hiPhivePrioritiser step
 4455                    if not param_exomiser_phenotypicfeatures:
 4456                        for step in param_exomiser_analysis_dict.get(
 4457                            "analysis", {}
 4458                        ).get("steps", []):
 4459                            if "hiPhivePrioritiser" in step:
 4460                                param_exomiser_analysis_dict.get("analysis", {}).get(
 4461                                    "steps", []
 4462                                ).remove(step)
 4463
 4464                ### Add Input File ###
 4465
 4466                # Initial file name and htsFiles
 4467                tmp_vcf_name = os.path.join(tmp_dir, "initial.vcf.gz")
 4468                param_exomiser_analysis_dict["phenopacket"]["htsFiles"] = [
 4469                    {
 4470                        "uri": tmp_vcf_name,
 4471                        "htsFormat": "VCF",
 4472                        "genomeAssembly": assembly,
 4473                    }
 4474                ]
 4475
 4476                ### Add metaData ###
 4477
 4478                # If metaData not in analysis dict
 4479                if "metaData" not in param_exomiser_analysis_dict:
 4480                    param_exomiser_analysis_dict["phenopacket"]["metaData"] = {
 4481                        "created": f"{datetime.datetime.now()}".replace(" ", "T") + "Z",
 4482                        "createdBy": "howard",
 4483                        "phenopacketSchemaVersion": 1,
 4484                    }
 4485
 4486                ### OutputOptions ###
 4487
 4488                # Init output result folder
 4489                output_results = os.path.join(tmp_dir, "results")
 4490
 4491                # If no outputOptions in analysis dict
 4492                if "outputOptions" not in param_exomiser_analysis_dict:
 4493
 4494                    # default output formats
 4495                    defaut_output_formats = ["TSV_VARIANT", "VCF"]
 4496
 4497                    # Get outputOptions in param
 4498                    output_options = param_exomiser.get("outputOptions", None)
 4499
 4500                    # If no output_options in param -> check
 4501                    if not output_options:
 4502                        output_options = {
 4503                            "outputContributingVariantsOnly": False,
 4504                            "numGenes": 0,
 4505                            "outputFormats": defaut_output_formats,
 4506                        }
 4507
 4508                    # Replace outputDirectory in output options
 4509                    output_options["outputDirectory"] = output_results
 4510                    output_options["outputFileName"] = "howard"
 4511
 4512                    # Add outputOptions in analysis dict
 4513                    param_exomiser_analysis_dict["outputOptions"] = output_options
 4514
 4515                else:
 4516
 4517                    # Replace output_results and output format (if exists in param)
 4518                    param_exomiser_analysis_dict["outputOptions"][
 4519                        "outputDirectory"
 4520                    ] = output_results
 4521                    param_exomiser_analysis_dict["outputOptions"]["outputFormats"] = (
 4522                        list(
 4523                            set(
 4524                                param_exomiser_analysis_dict.get(
 4525                                    "outputOptions", {}
 4526                                ).get("outputFormats", [])
 4527                                + ["TSV_VARIANT", "VCF"]
 4528                            )
 4529                        )
 4530                    )
 4531
 4532                # log
 4533                log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}")
 4534
 4535                ### ANALYSIS FILE ###
 4536                #####################
 4537
 4538                ### Full JSON analysis config file ###
 4539
 4540                exomiser_analysis = os.path.join(tmp_dir, "analysis.json")
 4541                with open(exomiser_analysis, "w") as fp:
 4542                    json.dump(param_exomiser_analysis_dict, fp, indent=4)
 4543
 4544                ### SPLIT analysis and sample config files
 4545
 4546                # Splitted analysis dict
 4547                param_exomiser_analysis_dict_for_split = (
 4548                    param_exomiser_analysis_dict.copy()
 4549                )
 4550
 4551                # Phenopacket JSON file
 4552                exomiser_analysis_phenopacket = os.path.join(
 4553                    tmp_dir, "analysis_phenopacket.json"
 4554                )
 4555                with open(exomiser_analysis_phenopacket, "w") as fp:
 4556                    json.dump(
 4557                        param_exomiser_analysis_dict_for_split.get("phenopacket"),
 4558                        fp,
 4559                        indent=4,
 4560                    )
 4561
 4562                # Analysis JSON file without Phenopacket parameters
 4563                param_exomiser_analysis_dict_for_split.pop("phenopacket")
 4564                exomiser_analysis_analysis = os.path.join(
 4565                    tmp_dir, "analysis_analysis.json"
 4566                )
 4567                with open(exomiser_analysis_analysis, "w") as fp:
 4568                    json.dump(param_exomiser_analysis_dict_for_split, fp, indent=4)
 4569
 4570                ### INITIAL VCF file ###
 4571                #######################
 4572
 4573                ### Create list of samples to use and include into initial VCF file ####
 4574
 4575                # Subject (main sample)
 4576                # Get sample ID in analysis dict
 4577                sample_subject = (
 4578                    param_exomiser_analysis_dict.get("phenopacket", {})
 4579                    .get("subject", {})
 4580                    .get("id", None)
 4581                )
 4582                sample_proband = (
 4583                    param_exomiser_analysis_dict.get("phenopacket", {})
 4584                    .get("proband", {})
 4585                    .get("subject", {})
 4586                    .get("id", None)
 4587                )
 4588                sample = []
 4589                if sample_subject:
 4590                    sample.append(sample_subject)
 4591                if sample_proband:
 4592                    sample.append(sample_proband)
 4593
 4594                # Get sample ID within Pedigree
 4595                pedigree_persons_list = (
 4596                    param_exomiser_analysis_dict.get("phenopacket", {})
 4597                    .get("pedigree", {})
 4598                    .get("persons", {})
 4599                )
 4600
 4601                # Create list with all sample ID in pedigree (if exists)
 4602                pedigree_persons = []
 4603                for person in pedigree_persons_list:
 4604                    pedigree_persons.append(person.get("individualId"))
 4605
 4606                # Concat subject sample ID and sample IDs in pedigree samples
 4607                samples = list(set(sample + pedigree_persons))
 4608
 4609                # Check if sample list is not empty
 4610                if not samples:
 4611                    log.error(f"No samples found")
 4612                    raise ValueError(f"No samples found")
 4613
 4614                # Create VCF with sample (either sample in param or first one by default)
 4615                # Export VCF file
 4616                self.export_variant_vcf(
 4617                    vcf_file=tmp_vcf_name,
 4618                    remove_info=True,
 4619                    add_samples=True,
 4620                    list_samples=samples,
 4621                    index=False,
 4622                )
 4623
 4624                ### Execute Exomiser ###
 4625                ########################
 4626
 4627                # Init command
 4628                exomiser_command = ""
 4629
 4630                # Command exomiser options
 4631                exomiser_options = f" --spring.config.location={databases_folders}/{assembly}/application.properties --exomiser.data-directory={databases_folders}/{assembly} "
 4632
 4633                # Release
 4634                exomiser_release = param_exomiser.get("release", None)
 4635                if exomiser_release:
 4636                    # phenotype data version
 4637                    exomiser_options += (
 4638                        f" --exomiser.phenotype.data-version={exomiser_release} "
 4639                    )
 4640                    # data version
 4641                    exomiser_options += (
 4642                        f" --exomiser.{assembly}.data-version={exomiser_release} "
 4643                    )
 4644                    # variant white list
 4645                    variant_white_list_file = (
 4646                        f"{exomiser_release}_{assembly}_clinvar_whitelist.tsv.gz"
 4647                    )
 4648                    if os.path.exists(
 4649                        os.path.join(
 4650                            databases_folders, assembly, variant_white_list_file
 4651                        )
 4652                    ):
 4653                        exomiser_options += f" --exomiser.{assembly}.variant-white-list-path={variant_white_list_file} "
 4654
 4655                # transcript_source
 4656                transcript_source = param_exomiser.get(
 4657                    "transcript_source", None
 4658                )  # ucsc, refseq, ensembl
 4659                if transcript_source:
 4660                    exomiser_options += (
 4661                        f" --exomiser.{assembly}.transcript-source={transcript_source} "
 4662                    )
 4663
 4664                # If analysis contain proband param
 4665                if param_exomiser_analysis_dict.get("phenopacket", {}).get(
 4666                    "proband", {}
 4667                ):
 4668                    exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis_analysis} --sample={exomiser_analysis_phenopacket} {exomiser_options} "
 4669
 4670                # If no proband (usually uniq sample)
 4671                else:
 4672                    exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis} {exomiser_options}"
 4673
 4674                # Log
 4675                log.debug(f"exomiser_command_analysis={exomiser_command_analysis}")
 4676
 4677                # Run command
 4678                result = subprocess.call(
 4679                    exomiser_command_analysis.split(), stdout=subprocess.PIPE
 4680                )
 4681                if result:
 4682                    log.error("Exomiser command failed")
 4683                    raise ValueError("Exomiser command failed")
 4684
 4685                ### RESULTS ###
 4686                ###############
 4687
 4688                ### Annotate with TSV fields ###
 4689
 4690                # Init result tsv file
 4691                exomiser_to_info = param_exomiser.get("exomiser_to_info", False)
 4692
 4693                # Init result tsv file
 4694                output_results_tsv = os.path.join(output_results, "howard.variants.tsv")
 4695
 4696                # Parse TSV file and explode columns in INFO field
 4697                if exomiser_to_info and os.path.exists(output_results_tsv):
 4698
 4699                    # Log
 4700                    log.debug("Exomiser columns to VCF INFO field")
 4701
 4702                    # Retrieve columns and types
 4703                    query = f""" SELECT * FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) LIMIT 0 """
 4704                    output_results_tsv_df = self.get_query_to_df(query)
 4705                    output_results_tsv_columns = output_results_tsv_df.columns.tolist()
 4706
 4707                    # Init concat fields for update
 4708                    sql_query_update_concat_fields = []
 4709
 4710                    # Fields to avoid
 4711                    fields_to_avoid = [
 4712                        "CONTIG",
 4713                        "START",
 4714                        "END",
 4715                        "REF",
 4716                        "ALT",
 4717                        "QUAL",
 4718                        "FILTER",
 4719                        "GENOTYPE",
 4720                    ]
 4721
 4722                    # List all columns to add into header
 4723                    for header_column in output_results_tsv_columns:
 4724
 4725                        # If header column is enable
 4726                        if header_column not in fields_to_avoid:
 4727
 4728                            # Header info type
 4729                            header_info_type = "String"
 4730                            header_column_df = output_results_tsv_df[header_column]
 4731                            header_column_df_dtype = header_column_df.dtype
 4732                            if header_column_df_dtype == object:
 4733                                if (
 4734                                    pd.to_numeric(header_column_df, errors="coerce")
 4735                                    .notnull()
 4736                                    .all()
 4737                                ):
 4738                                    header_info_type = "Float"
 4739                            else:
 4740                                header_info_type = "Integer"
 4741
 4742                            # Header info
 4743                            characters_to_validate = ["-"]
 4744                            pattern = "[" + "".join(characters_to_validate) + "]"
 4745                            header_info_name = re.sub(
 4746                                pattern,
 4747                                "_",
 4748                                f"Exomiser_{header_column}".replace("#", ""),
 4749                            )
 4750                            header_info_number = "."
 4751                            header_info_description = (
 4752                                f"Exomiser {header_column} annotation"
 4753                            )
 4754                            header_info_source = "Exomiser"
 4755                            header_info_version = "unknown"
 4756                            header_info_code = CODE_TYPE_MAP[header_info_type]
 4757                            vcf_reader.infos[header_info_name] = vcf.parser._Info(
 4758                                header_info_name,
 4759                                header_info_number,
 4760                                header_info_type,
 4761                                header_info_description,
 4762                                header_info_source,
 4763                                header_info_version,
 4764                                header_info_code,
 4765                            )
 4766
 4767                            # Add field to add for update to concat fields
 4768                            sql_query_update_concat_fields.append(
 4769                                f"""
 4770                                CASE
 4771                                    WHEN table_parquet."{header_column}" NOT IN ('','.')
 4772                                    THEN concat(
 4773                                        '{header_info_name}=',
 4774                                        table_parquet."{header_column}",
 4775                                        ';'
 4776                                        )
 4777
 4778                                    ELSE ''
 4779                                END
 4780                            """
 4781                            )
 4782
 4783                    # Update query
 4784                    sql_query_update = f"""
 4785                        UPDATE {table_variants} as table_variants
 4786                            SET INFO = concat(
 4787                                            CASE
 4788                                                WHEN INFO NOT IN ('', '.')
 4789                                                THEN INFO
 4790                                                ELSE ''
 4791                                            END,
 4792                                            CASE
 4793                                                WHEN table_variants.INFO NOT IN ('','.')
 4794                                                THEN ';'
 4795                                                ELSE ''
 4796                                            END,
 4797                                            (
 4798                                            SELECT 
 4799                                                concat(
 4800                                                    {",".join(sql_query_update_concat_fields)}
 4801                                                )
 4802                                            FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) as table_parquet
 4803                                                    WHERE concat('chr', CAST(table_parquet.\"CONTIG\" AS STRING)) = table_variants.\"#CHROM\"
 4804                                                    AND table_parquet.\"START\" = table_variants.\"POS\"
 4805                                                    AND table_parquet.\"ALT\" = table_variants.\"ALT\"
 4806                                                    AND table_parquet.\"REF\" = table_variants.\"REF\"
 4807                                            )
 4808                                        )
 4809                            ;
 4810                        """
 4811
 4812                    # Update
 4813                    self.conn.execute(sql_query_update)
 4814
 4815                ### Annotate with VCF INFO field ###
 4816
 4817                # Init result VCF file
 4818                output_results_vcf = os.path.join(output_results, "howard.vcf.gz")
 4819
 4820                # If VCF exists
 4821                if os.path.exists(output_results_vcf):
 4822
 4823                    # Log
 4824                    log.debug("Exomiser result VCF update variants")
 4825
 4826                    # Find Exomiser INFO field annotation in header
 4827                    with gzip.open(output_results_vcf, "rt") as f:
 4828                        header_list = self.read_vcf_header(f)
 4829                    exomiser_vcf_header = vcf.Reader(
 4830                        io.StringIO("\n".join(header_list))
 4831                    )
 4832
 4833                    # Add annotation INFO field to header
 4834                    vcf_reader.infos["Exomiser"] = exomiser_vcf_header.infos["Exomiser"]
 4835
 4836                    # Update variants with VCF
 4837                    self.update_from_vcf(output_results_vcf)
 4838
 4839        return True
 4840
 4841    def annotation_snpeff(self, threads: int = None) -> None:
 4842        """
 4843        This function annotate with snpEff
 4844
 4845        :param threads: The number of threads to use
 4846        :return: the value of the variable "return_value".
 4847        """
 4848
 4849        # DEBUG
 4850        log.debug("Start annotation with snpeff databases")
 4851
 4852        # Threads
 4853        if not threads:
 4854            threads = self.get_threads()
 4855        log.debug("Threads: " + str(threads))
 4856
 4857        # DEBUG
 4858        delete_tmp = True
 4859        if self.get_config().get("verbosity", "warning") in ["debug"]:
 4860            delete_tmp = False
 4861            log.debug("Delete tmp files/folders: " + str(delete_tmp))
 4862
 4863        # Config
 4864        config = self.get_config()
 4865        log.debug("Config: " + str(config))
 4866
 4867        # Config - Folders - Databases
 4868        databases_folders = (
 4869            config.get("folders", {}).get("databases", {}).get("snpeff", ["."])
 4870        )
 4871        log.debug("Databases annotations: " + str(databases_folders))
 4872
 4873        # # Config - Java
 4874        # java_bin = get_bin(
 4875        #     tool="java",
 4876        #     bin="java",
 4877        #     bin_type="bin",
 4878        #     config=config,
 4879        #     default_folder="/usr/bin",
 4880        # )
 4881        # if not (os.path.exists(java_bin) or (java_bin and which(java_bin))):
 4882        #     log.error(f"Annotation failed: no java bin '{java_bin}'")
 4883        #     raise ValueError(f"Annotation failed: no java bin '{java_bin}'")
 4884
 4885        # # Config - snpEff bin
 4886        # snpeff_jar = get_bin(
 4887        #     tool="snpeff",
 4888        #     bin="snpEff.jar",
 4889        #     bin_type="jar",
 4890        #     config=config,
 4891        #     default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
 4892        # )
 4893        # if not (os.path.exists(snpeff_jar) or (snpeff_jar and which(snpeff_jar))):
 4894        #     log.error(f"Annotation failed: no snpEff jar '{snpeff_jar}'")
 4895        #     raise ValueError(f"Annotation failed: no snpEff jar '{snpeff_jar}'")
 4896
 4897        # Config - snpEff bin command
 4898        snpeff_bin_command = get_bin_command(
 4899            bin="snpEff.jar",
 4900            tool="snpeff",
 4901            bin_type="jar",
 4902            config=config,
 4903            default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
 4904        )
 4905        if not snpeff_bin_command:
 4906            msg_err = f"Annotation failed: no snpeff bin '{snpeff_bin_command}'"
 4907            log.error(msg_err)
 4908            raise ValueError(msg_err)
 4909
 4910        # Config - snpEff databases
 4911        snpeff_databases = (
 4912            config.get("folders", {})
 4913            .get("databases", {})
 4914            .get("snpeff", DEFAULT_SNPEFF_FOLDER)
 4915        )
 4916        snpeff_databases = full_path(snpeff_databases)
 4917        if snpeff_databases is not None and snpeff_databases != "":
 4918            log.debug(f"Create snpEff databases folder")
 4919            if not os.path.exists(snpeff_databases):
 4920                os.makedirs(snpeff_databases)
 4921
 4922        # Param
 4923        param = self.get_param()
 4924        log.debug("Param: " + str(param))
 4925
 4926        # Param
 4927        options = param.get("annotation", {}).get("snpeff", {}).get("options", None)
 4928        log.debug("Options: " + str(options))
 4929
 4930        # Param - Assembly
 4931        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
 4932
 4933        # Param - Options
 4934        snpeff_options = (
 4935            param.get("annotation", {}).get("snpeff", {}).get("options", "")
 4936        )
 4937        snpeff_stats = param.get("annotation", {}).get("snpeff", {}).get("stats", None)
 4938        snpeff_csvstats = (
 4939            param.get("annotation", {}).get("snpeff", {}).get("csvStats", None)
 4940        )
 4941        if snpeff_stats:
 4942            snpeff_stats = snpeff_stats.replace("OUTPUT", self.get_output())
 4943            snpeff_stats = full_path(snpeff_stats)
 4944            snpeff_options += f" -stats {snpeff_stats}"
 4945        if snpeff_csvstats:
 4946            snpeff_csvstats = snpeff_csvstats.replace("OUTPUT", self.get_output())
 4947            snpeff_csvstats = full_path(snpeff_csvstats)
 4948            snpeff_options += f" -csvStats {snpeff_csvstats}"
 4949
 4950        # Data
 4951        table_variants = self.get_table_variants()
 4952
 4953        # Check if not empty
 4954        log.debug("Check if not empty")
 4955        sql_query_chromosomes = (
 4956            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 4957        )
 4958        # if not self.conn.execute(f"{sql_query_chromosomes}").df()["count"][0]:
 4959        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
 4960            log.info(f"VCF empty")
 4961            return
 4962
 4963        # Export in VCF
 4964        log.debug("Create initial file to annotate")
 4965        tmp_vcf = NamedTemporaryFile(
 4966            prefix=self.get_prefix(),
 4967            dir=self.get_tmp_dir(),
 4968            suffix=".vcf.gz",
 4969            delete=True,
 4970        )
 4971        tmp_vcf_name = tmp_vcf.name
 4972
 4973        # VCF header
 4974        vcf_reader = self.get_header()
 4975        log.debug("Initial header: " + str(vcf_reader.infos))
 4976
 4977        # Existing annotations
 4978        for vcf_annotation in self.get_header().infos:
 4979
 4980            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 4981            log.debug(
 4982                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 4983            )
 4984
 4985        # Memory limit
 4986        # if config.get("memory", None):
 4987        #     memory_limit = config.get("memory", "8G")
 4988        # else:
 4989        #     memory_limit = "8G"
 4990        memory_limit = self.get_memory("8G")
 4991        log.debug(f"memory_limit: {memory_limit}")
 4992
 4993        # snpEff java options
 4994        snpeff_java_options = (
 4995            f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} "
 4996        )
 4997        log.debug(f"Exomiser java options: {snpeff_java_options}")
 4998
 4999        force_update_annotation = True
 5000
 5001        if "ANN" not in self.get_header().infos or force_update_annotation:
 5002
 5003            # Check snpEff database
 5004            log.debug(f"Check snpEff databases {[assembly]}")
 5005            databases_download_snpeff(
 5006                folder=snpeff_databases, assemblies=[assembly], config=config
 5007            )
 5008
 5009            # Export VCF file
 5010            self.export_variant_vcf(
 5011                vcf_file=tmp_vcf_name,
 5012                remove_info=True,
 5013                add_samples=False,
 5014                index=True,
 5015            )
 5016
 5017            # Tmp file
 5018            err_files = []
 5019            tmp_annotate_vcf = NamedTemporaryFile(
 5020                prefix=self.get_prefix(),
 5021                dir=self.get_tmp_dir(),
 5022                suffix=".vcf",
 5023                delete=False,
 5024            )
 5025            tmp_annotate_vcf_name = tmp_annotate_vcf.name
 5026            tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
 5027            err_files.append(tmp_annotate_vcf_name_err)
 5028
 5029            # Command
 5030            snpeff_command = f"{snpeff_bin_command} {assembly} -dataDir {snpeff_databases} {snpeff_options} {tmp_vcf_name} 1>{tmp_annotate_vcf_name} 2>>{tmp_annotate_vcf_name_err}"
 5031            log.debug(f"Annotation - snpEff command: {snpeff_command}")
 5032            run_parallel_commands([snpeff_command], 1)
 5033
 5034            # Error messages
 5035            log.info(f"Error/Warning messages:")
 5036            error_message_command_all = []
 5037            error_message_command_warning = []
 5038            error_message_command_err = []
 5039            for err_file in err_files:
 5040                with open(err_file, "r") as f:
 5041                    for line in f:
 5042                        message = line.strip()
 5043                        error_message_command_all.append(message)
 5044                        if line.startswith("[W::"):
 5045                            error_message_command_warning.append(message)
 5046                        if line.startswith("[E::"):
 5047                            error_message_command_err.append(f"{err_file}: " + message)
 5048            # log info
 5049            for message in list(
 5050                set(error_message_command_err + error_message_command_warning)
 5051            ):
 5052                log.info(f"   {message}")
 5053            # debug info
 5054            for message in list(set(error_message_command_all)):
 5055                log.debug(f"   {message}")
 5056            # failed
 5057            if len(error_message_command_err):
 5058                log.error("Annotation failed: Error in commands")
 5059                raise ValueError("Annotation failed: Error in commands")
 5060
 5061            # Find annotation in header
 5062            with open(tmp_annotate_vcf_name, "rt") as f:
 5063                header_list = self.read_vcf_header(f)
 5064            annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))
 5065
 5066            for ann in annovar_vcf_header.infos:
 5067                if ann not in self.get_header().infos:
 5068                    vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)
 5069
 5070            # Update variants
 5071            log.info(f"Annotation - Updating...")
 5072            self.update_from_vcf(tmp_annotate_vcf_name)
 5073
 5074        else:
 5075            if "ANN" in self.get_header().infos:
 5076                log.debug(f"Existing snpEff annotations in VCF")
 5077            if force_update_annotation:
 5078                log.debug(f"Existing snpEff annotations in VCF - annotation forced")
 5079
 5080    def annotation_annovar(self, threads: int = None) -> None:
 5081        """
 5082        It takes a VCF file, annotates it with Annovar, and then updates the database with the new
 5083        annotations
 5084
 5085        :param threads: number of threads to use
 5086        :return: the value of the variable "return_value".
 5087        """
 5088
 5089        # DEBUG
 5090        log.debug("Start annotation with Annovar databases")
 5091
 5092        # Threads
 5093        if not threads:
 5094            threads = self.get_threads()
 5095        log.debug("Threads: " + str(threads))
 5096
 5097        # Tmp en Err files
 5098        tmp_files = []
 5099        err_files = []
 5100
 5101        # DEBUG
 5102        delete_tmp = True
 5103        if self.get_config().get("verbosity", "warning") in ["debug"]:
 5104            delete_tmp = False
 5105            log.debug("Delete tmp files/folders: " + str(delete_tmp))
 5106
 5107        # Config
 5108        config = self.get_config()
 5109        log.debug("Config: " + str(config))
 5110
 5111        # Config - Folders - Databases
 5112        databases_folders = (
 5113            config.get("folders", {}).get("databases", {}).get("annovar", ["."])
 5114        )
 5115        log.debug("Databases annotations: " + str(databases_folders))
 5116
 5117        # Config - annovar bin command
 5118        annovar_bin_command = get_bin_command(
 5119            bin="table_annovar.pl",
 5120            tool="annovar",
 5121            bin_type="perl",
 5122            config=config,
 5123            default_folder=f"{DEFAULT_TOOLS_FOLDER}/annovar",
 5124        )
 5125        if not annovar_bin_command:
 5126            msg_err = f"Annotation failed: no annovar bin '{annovar_bin_command}'"
 5127            log.error(msg_err)
 5128            raise ValueError(msg_err)
 5129
 5130        # Config - BCFTools bin command
 5131        bcftools_bin_command = get_bin_command(
 5132            bin="bcftools",
 5133            tool="bcftools",
 5134            bin_type="bin",
 5135            config=config,
 5136            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
 5137        )
 5138        if not bcftools_bin_command:
 5139            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
 5140            log.error(msg_err)
 5141            raise ValueError(msg_err)
 5142
 5143        # Config - annovar databases
 5144        annovar_databases = (
 5145            config.get("folders", {})
 5146            .get("databases", {})
 5147            .get("annovar", DEFAULT_ANNOVAR_FOLDER)
 5148        )
 5149        annovar_databases = full_path(annovar_databases)
 5150        if annovar_databases != "" and not os.path.exists(annovar_databases):
 5151            os.makedirs(annovar_databases)
 5152
 5153        # Param
 5154        param = self.get_param()
 5155        log.debug("Param: " + str(param))
 5156
 5157        # Param - options
 5158        options = param.get("annotation", {}).get("annovar", {}).get("options", {})
 5159        log.debug("Options: " + str(options))
 5160
 5161        # Param - annotations
 5162        annotations = (
 5163            param.get("annotation", {}).get("annovar", {}).get("annotations", {})
 5164        )
 5165        log.debug("Annotations: " + str(annotations))
 5166
 5167        # Param - Assembly
 5168        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
 5169
 5170        # Annovar database assembly
 5171        annovar_databases_assembly = f"{annovar_databases}/{assembly}"
 5172        if annovar_databases_assembly != "" and not os.path.exists(
 5173            annovar_databases_assembly
 5174        ):
 5175            os.makedirs(annovar_databases_assembly)
 5176
 5177        # Data
 5178        table_variants = self.get_table_variants()
 5179
 5180        # Check if not empty
 5181        log.debug("Check if not empty")
 5182        sql_query_chromosomes = (
 5183            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 5184        )
 5185        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
 5186        if not sql_query_chromosomes_df["count"][0]:
 5187            log.info(f"VCF empty")
 5188            return
 5189
 5190        # VCF header
 5191        vcf_reader = self.get_header()
 5192        log.debug("Initial header: " + str(vcf_reader.infos))
 5193
 5194        # Existing annotations
 5195        for vcf_annotation in self.get_header().infos:
 5196
 5197            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 5198            log.debug(
 5199                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 5200            )
 5201
 5202        force_update_annotation = True
 5203
 5204        if annotations:
 5205
 5206            commands = []
 5207            tmp_annotates_vcf_name_list = []
 5208
 5209            # Export in VCF
 5210            log.debug("Create initial file to annotate")
 5211            tmp_vcf = NamedTemporaryFile(
 5212                prefix=self.get_prefix(),
 5213                dir=self.get_tmp_dir(),
 5214                suffix=".vcf.gz",
 5215                delete=False,
 5216            )
 5217            tmp_vcf_name = tmp_vcf.name
 5218            tmp_files.append(tmp_vcf_name)
 5219            tmp_files.append(tmp_vcf_name + ".tbi")
 5220
 5221            # Export VCF file
 5222            self.export_variant_vcf(
 5223                vcf_file=tmp_vcf_name,
 5224                remove_info=".",
 5225                add_samples=False,
 5226                index=True,
 5227            )
 5228
 5229            # Create file for field rename
 5230            log.debug("Create file for field rename")
 5231            tmp_rename = NamedTemporaryFile(
 5232                prefix=self.get_prefix(),
 5233                dir=self.get_tmp_dir(),
 5234                suffix=".rename",
 5235                delete=False,
 5236            )
 5237            tmp_rename_name = tmp_rename.name
 5238            tmp_files.append(tmp_rename_name)
 5239
 5240            # Check Annovar database
 5241            log.debug(
 5242                f"Check Annovar databases {[assembly]}: {list(annotations.keys())}"
 5243            )
 5244            databases_download_annovar(
 5245                folder=annovar_databases,
 5246                files=list(annotations.keys()),
 5247                assemblies=[assembly],
 5248            )
 5249
 5250            for annotation in annotations:
 5251                annotation_fields = annotations[annotation]
 5252
 5253                if not annotation_fields:
 5254                    annotation_fields = {"INFO": None}
 5255
 5256                log.info(f"Annotations Annovar - database '{annotation}'")
 5257                log.debug(f"Annotation '{annotation}' - fields: {annotation_fields}")
 5258
 5259                # Tmp file for annovar
 5260                err_files = []
 5261                tmp_annotate_vcf_directory = TemporaryDirectory(
 5262                    prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".annovar"
 5263                )
 5264                tmp_annotate_vcf_prefix = tmp_annotate_vcf_directory.name + "/annovar"
 5265                tmp_annotate_vcf_name_annovar = (
 5266                    tmp_annotate_vcf_prefix + "." + assembly + "_multianno.vcf"
 5267                )
 5268                tmp_annotate_vcf_name_err = tmp_annotate_vcf_directory.name + "/.err"
 5269                err_files.append(tmp_annotate_vcf_name_err)
 5270                tmp_files.append(tmp_annotate_vcf_name_err)
 5271
 5272                # Tmp file final vcf annotated by annovar
 5273                tmp_annotate_vcf = NamedTemporaryFile(
 5274                    prefix=self.get_prefix(),
 5275                    dir=self.get_tmp_dir(),
 5276                    suffix=".vcf.gz",
 5277                    delete=False,
 5278                )
 5279                tmp_annotate_vcf_name = tmp_annotate_vcf.name
 5280                tmp_annotates_vcf_name_list.append(tmp_annotate_vcf_name)
 5281                tmp_files.append(tmp_annotate_vcf_name)
 5282                tmp_files.append(tmp_annotate_vcf_name + ".tbi")
 5283
 5284                # Number of fields
 5285                annotation_list = []
 5286                annotation_renamed_list = []
 5287
 5288                for annotation_field in annotation_fields:
 5289
 5290                    # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
 5291                    annotation_fields_new_name = annotation_fields.get(
 5292                        annotation_field, annotation_field
 5293                    )
 5294                    if not annotation_fields_new_name:
 5295                        annotation_fields_new_name = annotation_field
 5296
 5297                    if (
 5298                        force_update_annotation
 5299                        or annotation_fields_new_name not in self.get_header().infos
 5300                    ):
 5301                        annotation_list.append(annotation_field)
 5302                        annotation_renamed_list.append(annotation_fields_new_name)
 5303                    else:  # annotation_fields_new_name in self.get_header().infos and not force_update_annotation:
 5304                        log.warning(
 5305                            f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
 5306                        )
 5307
 5308                    # Add rename info
 5309                    run_parallel_commands(
 5310                        [
 5311                            f"echo 'INFO/{annotation_field} {annotation_fields_new_name}' >> {tmp_rename_name}"
 5312                        ],
 5313                        1,
 5314                    )
 5315
 5316                # log.debug("fields_to_removed: " + str(fields_to_removed))
 5317                log.debug("annotation_list: " + str(annotation_list))
 5318
 5319                # protocol
 5320                protocol = annotation
 5321
 5322                # argument
 5323                argument = ""
 5324
 5325                # operation
 5326                operation = "f"
 5327                if annotation in ["refGene", "refGeneWithVer"] or annotation.startswith(
 5328                    "ensGene"
 5329                ):
 5330                    operation = "g"
 5331                    if options.get("genebase", None):
 5332                        argument = f"""'{options.get("genebase","")}'"""
 5333                elif annotation in ["cytoBand"]:
 5334                    operation = "r"
 5335
 5336                # argument option
 5337                argument_option = ""
 5338                if argument != "":
 5339                    argument_option = " --argument " + argument
 5340
 5341                # command options
 5342                command_options = f""" --nastring . --vcfinput --polish --dot2underline --thread {threads} """  # --intronhgvs 10
 5343                for option in options:
 5344                    if option not in ["genebase"]:
 5345                        command_options += f""" --{option}={options[option]}"""
 5346
 5347                # Command
 5348
 5349                # Command - Annovar
 5350                command_annovar = f"""{annovar_bin_command} {tmp_vcf_name} {annovar_databases_assembly} --buildver {assembly} --outfile {tmp_annotate_vcf_prefix} --remove --protocol {protocol} --operation {operation} {argument_option} {command_options} 2>>{tmp_annotate_vcf_name_err} && mv {tmp_annotate_vcf_name_annovar} {tmp_annotate_vcf_name}.tmp.vcf """
 5351                tmp_files.append(f"{tmp_annotate_vcf_name}.tmp.vcf")
 5352
 5353                # Command - start pipe
 5354                command_annovar += f""" && {bcftools_bin_command} view --threads={threads} {tmp_annotate_vcf_name}.tmp.vcf 2>>{tmp_annotate_vcf_name_err} """
 5355
 5356                # Command - Clean INFO/ANNOVAR_DATE (due to Annovar issue with multiple TAGS!)
 5357                command_annovar += """ | sed "s/ANNOVAR_DATE=[^;\t]*;//gi" """
 5358
 5359                # Command - Special characters (refGene annotation)
 5360                command_annovar += """ | sed "s/\\\\\\x3b/,/gi" """
 5361
 5362                # Command - Clean empty fields (with value ".")
 5363                command_annovar += """ | awk -F'\\t' -v OFS='\\t' '{if ($0 ~ /^#/) print; else {split($8,a,";");for(i=1;i<=length(a);i++) {split(a[i],b,"=");if(b[2]!=".") {c[b[1]]=b[2]}}; split($8,d,";");for(i=1;i<=length(d);i++) {split(d[i],e,"=");if(c[e[1]]!="") {if(info!="") {info=info";"}; info=info""e[1]"="c[e[1]]}}; if(info!="") {$8=info} else {$8=""}; delete c; info=""; print}}' """
 5364
 5365                # Command - Extract only needed fields, and remove ANNOVAR fields, and compress and index final file
 5366                annovar_fields_to_keep = ["INFO/ANNOVAR_DATE", "INFO/ALLELE_END"]
 5367                if "ALL" not in annotation_list and "INFO" not in annotation_list:
 5368                    # for ann in annotation_renamed_list:
 5369                    for ann in annotation_list:
 5370                        annovar_fields_to_keep.append(f"^INFO/{ann}")
 5371
 5372                command_annovar += f""" | {bcftools_bin_command} annotate --pair-logic exact --threads={threads} -x {",".join(annovar_fields_to_keep)} --rename-annots={tmp_rename_name} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} """
 5373
 5374                # Command - indexing
 5375                command_annovar += f"""  && tabix {tmp_annotate_vcf_name} """
 5376
 5377                log.debug(f"Annotation - Annovar command: {command_annovar}")
 5378                run_parallel_commands([command_annovar], 1)
 5379
 5380                # Error messages
 5381                log.info(f"Error/Warning messages:")
 5382                error_message_command_all = []
 5383                error_message_command_warning = []
 5384                error_message_command_err = []
 5385                for err_file in err_files:
 5386                    with open(err_file, "r") as f:
 5387                        for line in f:
 5388                            message = line.strip()
 5389                            error_message_command_all.append(message)
 5390                            if line.startswith("[W::") or line.startswith("WARNING"):
 5391                                error_message_command_warning.append(message)
 5392                            if line.startswith("[E::") or line.startswith("ERROR"):
 5393                                error_message_command_err.append(
 5394                                    f"{err_file}: " + message
 5395                                )
 5396                # log info
 5397                for message in list(
 5398                    set(error_message_command_err + error_message_command_warning)
 5399                ):
 5400                    log.info(f"   {message}")
 5401                # debug info
 5402                for message in list(set(error_message_command_all)):
 5403                    log.debug(f"   {message}")
 5404                # failed
 5405                if len(error_message_command_err):
 5406                    log.error("Annotation failed: Error in commands")
 5407                    raise ValueError("Annotation failed: Error in commands")
 5408
 5409            if tmp_annotates_vcf_name_list:
 5410
 5411                # List of annotated files
 5412                tmp_annotates_vcf_name_to_merge = " ".join(tmp_annotates_vcf_name_list)
 5413
 5414                # Tmp file
 5415                tmp_annotate_vcf = NamedTemporaryFile(
 5416                    prefix=self.get_prefix(),
 5417                    dir=self.get_tmp_dir(),
 5418                    suffix=".vcf.gz",
 5419                    delete=False,
 5420                )
 5421                tmp_annotate_vcf_name = tmp_annotate_vcf.name
 5422                tmp_files.append(tmp_annotate_vcf_name)
 5423                tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
 5424                err_files.append(tmp_annotate_vcf_name_err)
 5425                tmp_files.append(tmp_annotate_vcf_name_err)
 5426
 5427                # Command merge
 5428                merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_annotates_vcf_name_to_merge} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} "
 5429                log.info(
 5430                    f"Annotation Annovar - Annotation merging "
 5431                    + str(len(tmp_annotates_vcf_name_list))
 5432                    + " annotated files"
 5433                )
 5434                log.debug(f"Annotation - merge command: {merge_command}")
 5435                run_parallel_commands([merge_command], 1)
 5436
 5437                # Find annotation in header
 5438                with bgzf.open(tmp_annotate_vcf_name, "rt") as f:
 5439                    header_list = self.read_vcf_header(f)
 5440                annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))
 5441
 5442                for ann in annovar_vcf_header.infos:
 5443                    if ann not in self.get_header().infos:
 5444                        vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)
 5445
 5446                # Update variants
 5447                log.info(f"Annotation Annovar - Updating...")
 5448                self.update_from_vcf(tmp_annotate_vcf_name)
 5449
 5450            # Clean files
 5451            # Tmp file remove command
 5452            if True:
 5453                tmp_files_remove_command = ""
 5454                if tmp_files:
 5455                    tmp_files_remove_command = " ".join(tmp_files)
 5456                clean_command = f" rm -f {tmp_files_remove_command} "
 5457                log.debug(f"Annotation Annovar - Annotation cleaning ")
 5458                log.debug(f"Annotation - cleaning command: {clean_command}")
 5459                run_parallel_commands([clean_command], 1)
 5460
    # Annotation with Parquet databases (see annotation_parquet below)
 5462    def annotation_parquet(self, threads: int = None) -> None:
 5463        """
 5464        It takes a VCF file, and annotates it with a parquet file
 5465
 5466        :param threads: number of threads to use for the annotation
 5467        :return: the value of the variable "result".
 5468        """
 5469
 5470        # DEBUG
 5471        log.debug("Start annotation with parquet databases")
 5472
 5473        # Threads
 5474        if not threads:
 5475            threads = self.get_threads()
 5476        log.debug("Threads: " + str(threads))
 5477
 5478        # DEBUG
 5479        delete_tmp = True
 5480        if self.get_config().get("verbosity", "warning") in ["debug"]:
 5481            delete_tmp = False
 5482            log.debug("Delete tmp files/folders: " + str(delete_tmp))
 5483
 5484        # Config
 5485        databases_folders = set(
 5486            self.get_config()
 5487            .get("folders", {})
 5488            .get("databases", {})
 5489            .get("annotations", ["."])
 5490            + self.get_config()
 5491            .get("folders", {})
 5492            .get("databases", {})
 5493            .get("parquet", ["."])
 5494        )
 5495        log.debug("Databases annotations: " + str(databases_folders))
 5496
 5497        # Param
 5498        annotations = (
 5499            self.get_param()
 5500            .get("annotation", {})
 5501            .get("parquet", {})
 5502            .get("annotations", None)
 5503        )
 5504        log.debug("Annotations: " + str(annotations))
 5505
 5506        # Assembly
 5507        assembly = self.get_param().get(
 5508            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
 5509        )
 5510
 5511        # Force Update Annotation
 5512        force_update_annotation = (
 5513            self.get_param()
 5514            .get("annotation", {})
 5515            .get("options", {})
 5516            .get("annotations_update", False)
 5517        )
 5518        log.debug(f"force_update_annotation={force_update_annotation}")
 5519        force_append_annotation = (
 5520            self.get_param()
 5521            .get("annotation", {})
 5522            .get("options", {})
 5523            .get("annotations_append", False)
 5524        )
 5525        log.debug(f"force_append_annotation={force_append_annotation}")
 5526
 5527        # Data
 5528        table_variants = self.get_table_variants()
 5529
 5530        # Check if not empty
 5531        log.debug("Check if not empty")
 5532        sql_query_chromosomes_df = self.get_query_to_df(
 5533            f"""SELECT count(*) as count FROM {table_variants} as table_variants LIMIT 1"""
 5534        )
 5535        if not sql_query_chromosomes_df["count"][0]:
 5536            log.info(f"VCF empty")
 5537            return
 5538
 5539        # VCF header
 5540        vcf_reader = self.get_header()
 5541        log.debug("Initial header: " + str(vcf_reader.infos))
 5542
 5543        # Nb Variants POS
 5544        log.debug("NB Variants Start")
 5545        nb_variants = self.conn.execute(
 5546            f"SELECT count(*) AS count FROM variants"
 5547        ).fetchdf()["count"][0]
 5548        log.debug("NB Variants Stop")
 5549
 5550        # Existing annotations
 5551        for vcf_annotation in self.get_header().infos:
 5552
 5553            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 5554            log.debug(
 5555                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 5556            )
 5557
 5558        # Added columns
 5559        added_columns = []
 5560
 5561        # drop indexes
 5562        log.debug(f"Drop indexes...")
 5563        self.drop_indexes()
 5564
 5565        if annotations:
 5566
 5567            if "ALL" in annotations:
 5568
 5569                all_param = annotations.get("ALL", {})
 5570                all_param_formats = all_param.get("formats", None)
 5571                all_param_releases = all_param.get("releases", None)
 5572
 5573                databases_infos_dict = self.scan_databases(
 5574                    database_formats=all_param_formats,
 5575                    database_releases=all_param_releases,
 5576                )
 5577                for database_infos in databases_infos_dict.keys():
 5578                    if database_infos not in annotations:
 5579                        annotations[database_infos] = {"INFO": None}
 5580
 5581            for annotation in annotations:
 5582
 5583                if annotation in ["ALL"]:
 5584                    continue
 5585
 5586                # Annotation Name
 5587                annotation_name = os.path.basename(annotation)
 5588
 5589                # Annotation fields
 5590                annotation_fields = annotations[annotation]
 5591                if not annotation_fields:
 5592                    annotation_fields = {"INFO": None}
 5593
 5594                log.debug(f"Annotation '{annotation_name}'")
 5595                log.debug(
 5596                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
 5597                )
 5598
 5599                # Create Database
 5600                database = Database(
 5601                    database=annotation,
 5602                    databases_folders=databases_folders,
 5603                    assembly=assembly,
 5604                )
 5605
 5606                # Find files
 5607                parquet_file = database.get_database()
 5608                parquet_hdr_file = database.get_header_file()
 5609                parquet_type = database.get_type()
 5610
 5611                # Check if files exists
 5612                if not parquet_file or not parquet_hdr_file:
 5613                    log.error("Annotation failed: file not found")
 5614                    raise ValueError("Annotation failed: file not found")
 5615                else:
 5616                    # Get parquet connexion
 5617                    parquet_sql_attach = database.get_sql_database_attach(
 5618                        output="query"
 5619                    )
 5620                    if parquet_sql_attach:
 5621                        self.conn.execute(parquet_sql_attach)
 5622                    parquet_file_link = database.get_sql_database_link()
 5623                    # Log
 5624                    log.debug(
 5625                        f"Annotation '{annotation_name}' - file: "
 5626                        + str(parquet_file)
 5627                        + " and "
 5628                        + str(parquet_hdr_file)
 5629                    )
 5630
 5631                    # Database full header columns
 5632                    parquet_hdr_vcf_header_columns = database.get_header_file_columns(
 5633                        parquet_hdr_file
 5634                    )
 5635                    # Log
 5636                    log.debug(
 5637                        "Annotation database header columns : "
 5638                        + str(parquet_hdr_vcf_header_columns)
 5639                    )
 5640
 5641                    # Load header as VCF object
 5642                    parquet_hdr_vcf_header_infos = database.get_header().infos
 5643                    # Log
 5644                    log.debug(
 5645                        "Annotation database header: "
 5646                        + str(parquet_hdr_vcf_header_infos)
 5647                    )
 5648
 5649                    # Get extra infos
 5650                    parquet_columns = database.get_extra_columns()
 5651                    # Log
 5652                    log.debug("Annotation database Columns: " + str(parquet_columns))
 5653
 5654                    # Add extra columns if "ALL" in annotation_fields
 5655                    # if "ALL" in annotation_fields:
 5656                    #     allow_add_extra_column = True
 5657                    if "ALL" in annotation_fields and database.get_extra_columns():
 5658                        for extra_column in database.get_extra_columns():
 5659                            if (
 5660                                extra_column not in annotation_fields
 5661                                and extra_column.replace("INFO/", "")
 5662                                not in parquet_hdr_vcf_header_infos
 5663                            ):
 5664                                parquet_hdr_vcf_header_infos[extra_column] = (
 5665                                    vcf.parser._Info(
 5666                                        extra_column,
 5667                                        ".",
 5668                                        "String",
 5669                                        f"{extra_column} description",
 5670                                        "unknown",
 5671                                        "unknown",
 5672                                        self.code_type_map["String"],
 5673                                    )
 5674                                )
 5675
 5676                    # For all fields in database
 5677                    annotation_fields_all = False
 5678                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
 5679                        annotation_fields_all = True
 5680                        annotation_fields = {
 5681                            key: key for key in parquet_hdr_vcf_header_infos
 5682                        }
 5683
 5684                        log.debug(
 5685                            "Annotation database header - All annotations added: "
 5686                            + str(annotation_fields)
 5687                        )
 5688
 5689                    # Init
 5690
 5691                    # List of annotation fields to use
 5692                    sql_query_annotation_update_info_sets = []
 5693
 5694                    # List of annotation to agregate
 5695                    sql_query_annotation_to_agregate = []
 5696
 5697                    # Number of fields
 5698                    nb_annotation_field = 0
 5699
 5700                    # Annotation fields processed
 5701                    annotation_fields_processed = []
 5702
 5703                    # Columns mapping
 5704                    map_columns = database.map_columns(
 5705                        columns=annotation_fields, prefixes=["INFO/"]
 5706                    )
 5707
 5708                    # Query dict for fields to remove (update option)
 5709                    query_dict_remove = {}
 5710
 5711                    # Fetch Anotation fields
 5712                    for annotation_field in annotation_fields:
 5713
 5714                        # annotation_field_column
 5715                        annotation_field_column = map_columns.get(
 5716                            annotation_field, "INFO"
 5717                        )
 5718
 5719                        # field new name, if parametered
 5720                        annotation_fields_new_name = annotation_fields.get(
 5721                            annotation_field, annotation_field
 5722                        )
 5723                        if not annotation_fields_new_name:
 5724                            annotation_fields_new_name = annotation_field
 5725
 5726                        # To annotate
 5727                        # force_update_annotation = True
 5728                        # force_append_annotation = True
 5729                        # if annotation_field in parquet_hdr_vcf_header_infos and (force_update_annotation or (annotation_fields_new_name not in self.get_header().infos)):
 5730                        if annotation_field in parquet_hdr_vcf_header_infos and (
 5731                            force_update_annotation
 5732                            or force_append_annotation
 5733                            or (
 5734                                annotation_fields_new_name
 5735                                not in self.get_header().infos
 5736                            )
 5737                        ):
 5738
 5739                            # Add field to annotation to process list
 5740                            annotation_fields_processed.append(
 5741                                annotation_fields_new_name
 5742                            )
 5743
 5744                            # explode infos for the field
 5745                            annotation_fields_new_name_info_msg = ""
 5746                            if (
 5747                                force_update_annotation
 5748                                and annotation_fields_new_name
 5749                                in self.get_header().infos
 5750                            ):
 5751                                # Remove field from INFO
 5752                                query = f"""
 5753                                    UPDATE {table_variants} as table_variants
 5754                                    SET INFO = REGEXP_REPLACE(
 5755                                                concat(table_variants.INFO,''),
 5756                                                ';*{annotation_fields_new_name}=[^;]*',
 5757                                                ''
 5758                                                )
 5759                                    WHERE concat(';',table_variants.INFO) LIKE '%;{annotation_fields_new_name}=%'
 5760                                """
 5761                                annotation_fields_new_name_info_msg = " [update]"
 5762                                query_dict_remove[
 5763                                    f"remove 'INFO/{annotation_fields_new_name}'"
 5764                                ] = query
 5765
 5766                            # Sep between fields in INFO
 5767                            nb_annotation_field += 1
 5768                            if nb_annotation_field > 1:
 5769                                annotation_field_sep = ";"
 5770                            else:
 5771                                annotation_field_sep = ""
 5772
 5773                            log.info(
 5774                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'{annotation_fields_new_name_info_msg}"
 5775                            )
 5776
 5777                            # Add INFO field to header
 5778                            parquet_hdr_vcf_header_infos_number = (
 5779                                parquet_hdr_vcf_header_infos[annotation_field].num
 5780                                or "."
 5781                            )
 5782                            parquet_hdr_vcf_header_infos_type = (
 5783                                parquet_hdr_vcf_header_infos[annotation_field].type
 5784                                or "String"
 5785                            )
 5786                            parquet_hdr_vcf_header_infos_description = (
 5787                                parquet_hdr_vcf_header_infos[annotation_field].desc
 5788                                or f"{annotation_field} description"
 5789                            )
 5790                            parquet_hdr_vcf_header_infos_source = (
 5791                                parquet_hdr_vcf_header_infos[annotation_field].source
 5792                                or "unknown"
 5793                            )
 5794                            parquet_hdr_vcf_header_infos_version = (
 5795                                parquet_hdr_vcf_header_infos[annotation_field].version
 5796                                or "unknown"
 5797                            )
 5798
 5799                            vcf_reader.infos[annotation_fields_new_name] = (
 5800                                vcf.parser._Info(
 5801                                    annotation_fields_new_name,
 5802                                    parquet_hdr_vcf_header_infos_number,
 5803                                    parquet_hdr_vcf_header_infos_type,
 5804                                    parquet_hdr_vcf_header_infos_description,
 5805                                    parquet_hdr_vcf_header_infos_source,
 5806                                    parquet_hdr_vcf_header_infos_version,
 5807                                    self.code_type_map[
 5808                                        parquet_hdr_vcf_header_infos_type
 5809                                    ],
 5810                                )
 5811                            )
 5812
 5813                            # Append
 5814                            if force_append_annotation:
 5815                                query_case_when_append = f""" AND REGEXP_EXTRACT(concat(';', table_variants.INFO), ';{annotation_fields_new_name}=([^;]*)',1) IN ('','.') """
 5816                            else:
 5817                                query_case_when_append = ""
 5818
 5819                            # Annotation/Update query fields
 5820                            # Found in INFO column
 5821                            if (
 5822                                annotation_field_column == "INFO"
 5823                                and "INFO" in parquet_hdr_vcf_header_columns
 5824                            ):
 5825                                sql_query_annotation_update_info_sets.append(
 5826                                    f"""
 5827                                CASE WHEN REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1) NOT IN ('','.') {query_case_when_append}
 5828                                        THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1))
 5829                                        ELSE ''
 5830                                    END
 5831                                """
 5832                                )
 5833                            # Found in a specific column
 5834                            else:
 5835                                # sql_query_annotation_update_info_sets.append(
 5836                                #     f"""
 5837                                # CASE WHEN table_parquet."{annotation_field_column}" NOT IN ('','.') {query_case_when_append}
 5838                                #         THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', replace(table_parquet."{annotation_field_column}", ';', ','))
 5839                                #         ELSE ''
 5840                                #     END
 5841                                # """
 5842                                # )
 5843                                sql_query_annotation_update_info_sets.append(
 5844                                    f"""
 5845                                CASE WHEN table_parquet."{annotation_field_column}" NOT IN ('','.') {query_case_when_append}
 5846                                        THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', replace(CAST(table_parquet."{annotation_field_column}" AS VARCHAR), ';', ','))
 5847                                        ELSE ''
 5848                                    END
 5849                                """
 5850                                )
 5851                                sql_query_annotation_to_agregate.append(
 5852                                    f""" string_agg(DISTINCT table_parquet_from."{annotation_field_column}", ',') AS "{annotation_field_column}" """
 5853                                )
 5854
 5855                        # Not to annotate
 5856                        else:
 5857
 5858                            if force_update_annotation:
 5859                                annotation_message = "forced"
 5860                            else:
 5861                                annotation_message = "skipped"
 5862
 5863                            if annotation_field not in parquet_hdr_vcf_header_infos:
 5864                                log.warning(
 5865                                    f"Annotation '{annotation_name}' - '{annotation_field}' [{nb_annotation_field}] - not available in parquet file"
 5866                                )
 5867                            if annotation_fields_new_name in self.get_header().infos:
 5868                                log.warning(
 5869                                    f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' [{nb_annotation_field}] - already exists in header ({annotation_message})"
 5870                                )
 5871
 5872                    # Check if ALL fields have to be annotated. Thus concat all INFO field
 5873                    # allow_annotation_full_info = True
 5874                    allow_annotation_full_info = not force_append_annotation
 5875
 5876                    if parquet_type in ["regions"]:
 5877                        allow_annotation_full_info = False
 5878
 5879                    if (
 5880                        allow_annotation_full_info
 5881                        and nb_annotation_field == len(annotation_fields)
 5882                        and annotation_fields_all
 5883                        and (
 5884                            "INFO" in parquet_hdr_vcf_header_columns
 5885                            and "INFO" in database.get_extra_columns()
 5886                        )
 5887                    ):
 5888                        log.debug("Column INFO annotation enabled")
 5889                        sql_query_annotation_update_info_sets = []
 5890                        sql_query_annotation_update_info_sets.append(
 5891                            f" table_parquet.INFO "
 5892                        )
 5893
 5894                    if sql_query_annotation_update_info_sets:
 5895
 5896                        # Annotate
 5897                        log.info(f"Annotation '{annotation_name}' - Annotation...")
 5898
 5899                        # Join query annotation update info sets for SQL
 5900                        sql_query_annotation_update_info_sets_sql = ",".join(
 5901                            sql_query_annotation_update_info_sets
 5902                        )
 5903
 5904                        # Check chromosomes list (and variants infos)
 5905                        sql_query_chromosomes = f"""
 5906                            SELECT table_variants."#CHROM" as CHROM, count(*) AS count_variants, min(POS) AS min_variants, MAX(POS) AS max_variants
 5907                            FROM {table_variants} as table_variants
 5908                            GROUP BY table_variants."#CHROM"
 5909                            ORDER BY table_variants."#CHROM"
 5910                            """
 5911                        sql_query_chromosomes_df = self.conn.execute(
 5912                            sql_query_chromosomes
 5913                        ).df()
 5914                        sql_query_chromosomes_dict = {
 5915                            entry["CHROM"]: {
 5916                                "count": entry["count_variants"],
 5917                                "min": entry["min_variants"],
 5918                                "max": entry["max_variants"],
 5919                            }
 5920                            for index, entry in sql_query_chromosomes_df.iterrows()
 5921                        }
 5922
 5923                        # Init
 5924                        nb_of_query = 0
 5925                        nb_of_variant_annotated = 0
 5926                        query_dict = query_dict_remove
 5927
 5928                        # for chrom in sql_query_chromosomes_df["CHROM"]:
 5929                        for chrom in sql_query_chromosomes_dict:
 5930
 5931                            # Number of variant by chromosome
 5932                            nb_of_variant_by_chrom = sql_query_chromosomes_dict.get(
 5933                                chrom, {}
 5934                            ).get("count", 0)
 5935
 5936                            log.debug(
 5937                                f"Annotation '{annotation_name}' - Chromosome '{chrom}' [{nb_of_variant_by_chrom} variants]..."
 5938                            )
 5939
 5940                            # Annotation with regions database
 5941                            if parquet_type in ["regions"]:
 5942                                sql_query_annotation_from_clause = f"""
 5943                                    FROM (
 5944                                        SELECT 
 5945                                            '{chrom}' AS \"#CHROM\",
 5946                                            table_variants_from.\"POS\" AS \"POS\",
 5947                                            {",".join(sql_query_annotation_to_agregate)}
 5948                                        FROM {table_variants} as table_variants_from
 5949                                        LEFT JOIN {parquet_file_link} as table_parquet_from ON (
 5950                                            table_parquet_from."#CHROM" = '{chrom}'
 5951                                            AND table_variants_from.\"POS\" <= table_parquet_from.\"END\"
 5952                                            AND (table_variants_from.\"POS\" >= (table_parquet_from.\"START\"+1)
 5953                                                OR table_variants_from.\"POS\" + (len(table_variants_from.\"REF\")-1) >= (table_parquet_from.\"START\"+1)
 5954                                                )
 5955                                        )
 5956                                        WHERE table_variants_from.\"#CHROM\" in ('{chrom}')
 5957                                        GROUP BY table_variants_from.\"POS\"
 5958                                        )
 5959                                        as table_parquet
 5960                                """
 5961
 5962                                sql_query_annotation_where_clause = """
 5963                                    table_parquet.\"#CHROM\" = table_variants.\"#CHROM\"
 5964                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
 5965                                """
 5966
 5967                            # Annotation with variants database
 5968                            else:
 5969                                sql_query_annotation_from_clause = f"""
 5970                                    FROM {parquet_file_link} as table_parquet
 5971                                """
 5972                                sql_query_annotation_where_clause = f"""
 5973                                    table_variants."#CHROM" = '{chrom}'
 5974                                    AND table_parquet.\"#CHROM\" = table_variants.\"#CHROM\" 
 5975                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
 5976                                    AND table_parquet.\"ALT\" = table_variants.\"ALT\"
 5977                                    AND table_parquet.\"REF\" = table_variants.\"REF\"
 5978                                """
 5979
 5980                            # Create update query
 5981                            sql_query_annotation_chrom_interval_pos = f"""
 5982                                UPDATE {table_variants} as table_variants
 5983                                    SET INFO = 
 5984                                        concat(
 5985                                            CASE WHEN table_variants.INFO NOT IN ('','.')
 5986                                                THEN table_variants.INFO
 5987                                                ELSE ''
 5988                                            END
 5989                                            ,
 5990                                            CASE WHEN table_variants.INFO NOT IN ('','.')
 5991                                                        AND (
 5992                                                        concat({sql_query_annotation_update_info_sets_sql})
 5993                                                        )
 5994                                                        NOT IN ('','.') 
 5995                                                    THEN ';'
 5996                                                    ELSE ''
 5997                                            END
 5998                                            ,
 5999                                            {sql_query_annotation_update_info_sets_sql}
 6000                                            )
 6001                                    {sql_query_annotation_from_clause}
 6002                                    WHERE {sql_query_annotation_where_clause}
 6003                                    ;
 6004                                """
 6005
 6006                            # Add update query to dict
 6007                            query_dict[
 6008                                f"{chrom} [{nb_of_variant_by_chrom} variants]"
 6009                            ] = sql_query_annotation_chrom_interval_pos
 6010
 6011                        nb_of_query = len(query_dict)
 6012                        num_query = 0
 6013
 6014                        # SET max_expression_depth TO x
 6015                        self.conn.execute("SET max_expression_depth TO 10000")
 6016
 6017                        for query_name in query_dict:
 6018                            query = query_dict[query_name]
 6019                            num_query += 1
 6020                            log.info(
 6021                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name}..."
 6022                            )
 6023                            result = self.conn.execute(query)
 6024                            nb_of_variant_annotated_by_query = result.df()["Count"][0]
 6025                            nb_of_variant_annotated += nb_of_variant_annotated_by_query
 6026                            log.info(
 6027                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name} - {nb_of_variant_annotated_by_query} variants annotated"
 6028                            )
 6029
 6030                        log.info(
 6031                            f"Annotation '{annotation_name}' - Annotation of {nb_of_variant_annotated} variants out of {nb_variants} (with {nb_of_query} queries)"
 6032                        )
 6033
 6034                    else:
 6035
 6036                        log.info(
 6037                            f"Annotation '{annotation_name}' - No Annotations available"
 6038                        )
 6039
 6040                    log.debug("Final header: " + str(vcf_reader.infos))
 6041
 6042        # Remove added columns
 6043        for added_column in added_columns:
 6044            self.drop_column(column=added_column)
 6045
 6046    def annotation_splice(self, threads: int = None) -> None:
 6047        """
 6048        This function annotate with snpEff
 6049
 6050        :param threads: The number of threads to use
 6051        :return: the value of the variable "return_value".
 6052        """
 6053
 6054        # DEBUG
 6055        log.debug("Start annotation with splice tools")
 6056
 6057        # Threads
 6058        if not threads:
 6059            threads = self.get_threads()
 6060        log.debug("Threads: " + str(threads))
 6061
 6062        # DEBUG
 6063        delete_tmp = True
 6064        if self.get_config().get("verbosity", "warning") in ["debug"]:
 6065            delete_tmp = False
 6066            log.debug("Delete tmp files/folders: " + str(delete_tmp))
 6067
 6068        # Config
 6069        config = self.get_config()
 6070        log.debug("Config: " + str(config))
 6071        splice_config = config.get("tools", {}).get("splice", {})
 6072        if not splice_config:
 6073            splice_config = DEFAULT_TOOLS_BIN.get("splice", {})
 6074        if not splice_config:
 6075            msg_err = "No Splice tool config"
 6076            log.error(msg_err)
 6077            raise ValueError(msg_err)
 6078        log.debug(f"splice_config={splice_config}")
 6079
 6080        # Config - Folders - Databases
 6081        databases_folders = (
 6082            config.get("folders", {}).get("databases", {}).get("splice", ["."])
 6083        )
 6084        log.debug("Databases annotations: " + str(databases_folders))
 6085
 6086        # Splice docker image
 6087        splice_docker_image = splice_config.get("docker").get("image")
 6088
 6089        # Pull splice image if it's not already there
 6090        if not check_docker_image_exists(splice_docker_image):
 6091            log.warning(
 6092                f"Annotation: splice docker image {splice_docker_image} not found locally, trying to pull from dockerhub"
 6093            )
 6094            try:
 6095                command(f"docker pull {splice_config.get('docker').get('image')}")
 6096            except subprocess.CalledProcessError:
 6097                msg_err = f"Unable to find docker {splice_docker_image} on dockerhub"
 6098                log.error(msg_err)
 6099                raise ValueError(msg_err)
 6100                return None
 6101
 6102        # Config - splice databases
 6103        splice_databases = (
 6104            config.get("folders", {})
 6105            .get("databases", {})
 6106            .get("splice", DEFAULT_SPLICE_FOLDER)
 6107        )
 6108        splice_databases = full_path(splice_databases)
 6109
 6110        # Param
 6111        param = self.get_param()
 6112        log.debug("Param: " + str(param))
 6113
 6114        # Param
 6115        options = param.get("annotation", {}).get("splice", {})
 6116        log.debug("Options: " + str(options))
 6117
 6118        # Data
 6119        table_variants = self.get_table_variants()
 6120
 6121        # Check if not empty
 6122        log.debug("Check if not empty")
 6123        sql_query_chromosomes = (
 6124            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 6125        )
 6126        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
 6127            log.info("VCF empty")
 6128            return None
 6129
 6130        # Export in VCF
 6131        log.debug("Create initial file to annotate")
 6132
 6133        # Create output folder
 6134        output_folder = os.path.join(self.get_tmp_dir(), f"splice-{get_random()}")
 6135        if not os.path.exists(output_folder):
 6136            Path(output_folder).mkdir(parents=True, exist_ok=True)
 6137
 6138        # Create tmp VCF file
 6139        tmp_vcf = NamedTemporaryFile(
 6140            prefix=self.get_prefix(),
 6141            dir=output_folder,
 6142            suffix=".vcf",
 6143            delete=False,
 6144        )
 6145        tmp_vcf_name = tmp_vcf.name
 6146
 6147        # VCF header
 6148        header = self.get_header()
 6149
 6150        # Existing annotations
 6151        for vcf_annotation in self.get_header().infos:
 6152
 6153            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 6154            log.debug(
 6155                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 6156            )
 6157
 6158        # Memory limit
 6159        if config.get("memory", None):
 6160            memory_limit = config.get("memory", "8G").upper()
 6161            # upper()
 6162        else:
 6163            memory_limit = "8G"
 6164        log.debug(f"memory_limit: {memory_limit}")
 6165
 6166        # Check number of variants to annotate
 6167        where_clause_regex_spliceai = r"SpliceAI_\w+"
 6168        where_clause_regex_spip = r"SPiP_\w+"
 6169        where_clause = f""" WHERE NOT regexp_matches("INFO", '{where_clause_regex_spliceai}') AND NOT regexp_matches("INFO", '{where_clause_regex_spip}')"""
 6170        df_list_of_variants_to_annotate = self.get_query_to_df(
 6171            query=f""" SELECT * FROM variants {where_clause} """
 6172        )
 6173        if len(df_list_of_variants_to_annotate) == 0:
 6174            log.warning(
 6175                f"No variants to annotate with splice. Variants probably already annotated with splice"
 6176            )
 6177            return None
 6178        else:
 6179            log.info(f"Annotation: {len(df_list_of_variants_to_annotate)} variants")
 6180
 6181        # Export VCF file
 6182        self.export_variant_vcf(
 6183            vcf_file=tmp_vcf_name,
 6184            remove_info=True,
 6185            add_samples=True,
 6186            index=False,
 6187            where_clause=where_clause,
 6188        )
 6189
 6190        # Create docker container and launch splice analysis
 6191        if splice_config:
 6192
 6193            # Splice mount folders
 6194            mount_folders = splice_config.get("mount", {})
 6195
 6196            # Genome mount
 6197            mount_folders[
 6198                config.get("folders", {})
 6199                .get("databases", {})
 6200                .get("genomes", DEFAULT_GENOME_FOLDER)
 6201            ] = "ro"
 6202
 6203            # SpliceAI mount
 6204            mount_folders[
 6205                config.get("folders", {})
 6206                .get("databases", {})
 6207                .get("spliceai", DEFAULT_SPLICEAI_FOLDER)
 6208            ] = "ro"
 6209
 6210            # Genome mount
 6211            mount_folders[
 6212                config.get("folders", {})
 6213                .get("databases", {})
 6214                .get("spip", DEFAULT_SPIP_FOLDER)
 6215            ] = "ro"
 6216
 6217            # Mount folders
 6218            mount = []
 6219
 6220            # Config mount
 6221            mount = [
 6222                f"-v {full_path(path)}:{full_path(path)}:{mode}"
 6223                for path, mode in mount_folders.items()
 6224            ]
 6225
 6226            if any(value for value in splice_config.values() if value is None):
 6227                log.warning("At least one splice config parameter is empty")
 6228                return None
 6229
 6230            # Params in splice nf
 6231            def check_values(dico: dict):
 6232                """
 6233                Ensure parameters for NF splice pipeline
 6234                """
 6235                for key, val in dico.items():
 6236                    if key == "genome":
 6237                        if any(
 6238                            assemb in options.get("genome", {})
 6239                            for assemb in ["hg19", "GRCh37", "grch37", "GRCH37"]
 6240                        ):
 6241                            yield f"--{key} hg19"
 6242                        elif any(
 6243                            assemb in options.get("genome", {})
 6244                            for assemb in ["hg38", "GRCh38", "grch38", "GRCH38"]
 6245                        ):
 6246                            yield f"--{key} hg38"
 6247                    elif (
 6248                        (isinstance(val, str) and val)
 6249                        or isinstance(val, int)
 6250                        or isinstance(val, bool)
 6251                    ):
 6252                        yield f"--{key} {val}"
 6253
 6254            # Genome
 6255            genome = options.get("genome", config.get("assembly", DEFAULT_ASSEMBLY))
 6256            options["genome"] = genome
 6257
 6258            # NF params
 6259            nf_params = []
 6260
 6261            # Add options
 6262            if options:
 6263                nf_params = list(check_values(options))
 6264                log.debug(f"Splice NF params: {' '.join(nf_params)}")
 6265            else:
 6266                log.debug("No NF params provided")
 6267
 6268            # Add threads
 6269            if "threads" not in options.keys():
 6270                nf_params.append(f"--threads {threads}")
 6271
 6272            # Genome path
 6273            genome_path = find_genome(
 6274                config.get("folders", {})
 6275                .get("databases", {})
 6276                .get("genomes", DEFAULT_GENOME_FOLDER),
 6277                file=f"{genome}.fa",
 6278            )
 6279            # Add genome path
 6280            if not genome_path:
 6281                raise ValueError(
 6282                    f"Can't find genome assembly {genome}.fa in {config.get('folders', {}).get('databases', {}).get('genomes', DEFAULT_GENOME_FOLDER)}"
 6283                )
 6284            else:
 6285                log.debug(f"Genome: {genome_path}")
 6286                nf_params.append(f"--genome_path {genome_path}")
 6287
 6288            def splice_annotations(options: dict = {}, config: dict = {}) -> list:
 6289                """
 6290                Setting up updated databases for SPiP and SpliceAI
 6291                """
 6292
 6293                try:
 6294
 6295                    # SpliceAI assembly transcriptome
 6296                    spliceai_assembly = os.path.join(
 6297                        config.get("folders", {})
 6298                        .get("databases", {})
 6299                        .get("spliceai", {}),
 6300                        options.get("genome"),
 6301                        "transcriptome",
 6302                    )
 6303                    spip_assembly = options.get("genome")
 6304
 6305                    spip = find(
 6306                        f"transcriptome_{spip_assembly}.RData",
 6307                        config.get("folders", {}).get("databases", {}).get("spip", {}),
 6308                    )
 6309                    spliceai = find("spliceai.refseq.txt", spliceai_assembly)
 6310                    log.debug(f"SPiP annotations: {spip}")
 6311                    log.debug(f"SpliceAI annotations: {spliceai}")
 6312                    if spip and spliceai:
 6313                        return [
 6314                            f"--spip_transcriptome {spip}",
 6315                            f"--spliceai_annotations {spliceai}",
 6316                        ]
 6317                    else:
 6318                        # TODO crash and go on with basic annotations ?
 6319                        # raise ValueError(
 6320                        #     "Can't find splice databases in configuration EXIT"
 6321                        # )
 6322                        log.warning(
 6323                            "Can't find splice databases in configuration, use annotations file from image"
 6324                        )
 6325                except TypeError:
 6326                    log.warning(
 6327                        "Can't find splice databases in configuration, use annotations file from image"
 6328                    )
 6329                    return []
 6330
 6331            # Add options, check if transcriptome option have already beend provided
 6332            if (
 6333                "spip_transcriptome" not in nf_params
 6334                and "spliceai_transcriptome" not in nf_params
 6335            ):
 6336                splice_reference = splice_annotations(options, config)
 6337                if splice_reference:
 6338                    nf_params.extend(splice_reference)
 6339
 6340            nf_params.append(f"--output_folder {output_folder}")
 6341
 6342            random_uuid = f"HOWARD-SPLICE-{get_random()}"
 6343            cmd = f"nextflow -log {os.path.join(output_folder, f'{random_uuid}.log')} -c /app/SpliceToolBox/src/splicetoolbox/nextflow/nextflow.docker.config run /app/SpliceToolBox/src/splicetoolbox/nextflow/main.nf -entry SPLICE --vcf {tmp_vcf_name} {' '.join(nf_params)} -profile standard,conda,singularity,report,timeline"
 6344            log.debug(cmd)
 6345
 6346            splice_config["docker"]["command"] = cmd
 6347
 6348            docker_cmd = get_bin_command(
 6349                tool="splice",
 6350                bin_type="docker",
 6351                config=config,
 6352                default_folder=f"{DEFAULT_TOOLS_FOLDER}/docker",
 6353                add_options=f"--name {random_uuid} {' '.join(mount)}",
 6354            )
 6355
 6356            # Docker debug
 6357            # if splice_config.get("rm_container"):
 6358            #     rm_container = "--rm"
 6359            # else:
 6360            #     rm_container = ""
 6361            # docker_cmd = f"docker run {rm_container} --entrypoint '/bin/bash' --name {random_uuid} {' '.join(mount)} {':'.join(splice_config.get('image'))} {cmd}"
 6362
 6363            log.debug(docker_cmd)
 6364            res = subprocess.run(docker_cmd, shell=True, capture_output=True, text=True)
 6365            log.debug(res.stdout)
 6366            if res.stderr:
 6367                log.error(res.stderr)
 6368            res.check_returncode()
 6369        else:
 6370            log.warning(f"Splice tool configuration not found: {config}")
 6371
 6372        # Update variants
 6373        log.info("Annotation - Updating...")
 6374        # Test find output vcf
 6375        log.debug(
 6376            f"TMP splice output: {os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz"
 6377        )
 6378        output_vcf = []
 6379        # Wrong folder to look in
 6380        for files in os.listdir(os.path.dirname(tmp_vcf_name)):
 6381            if (
 6382                files
 6383                == f"{os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz"
 6384            ):
 6385                output_vcf.append(os.path.join(os.path.dirname(tmp_vcf_name), files))
 6386        # log.debug(os.listdir(options.get("output_folder")))
 6387        log.debug(f"Splice annotated vcf: {output_vcf[0]}")
 6388        if not output_vcf:
 6389            log.debug(
 6390                f"Splice output was not generated {os.path.basename(tmp_vcf_name)}*.spip.spliceai.sorted.vcf.gz"
 6391            )
 6392        else:
 6393            # Get new header from annotated vcf
 6394            log.debug(f"Initial header: {len(header.infos)} fields")
 6395            # Create new header with splice infos
 6396            new_vcf = Variants(input=output_vcf[0])
 6397            new_vcf_header = new_vcf.get_header().infos
 6398            for keys, infos in new_vcf_header.items():
 6399                if keys not in header.infos.keys():
 6400                    header.infos[keys] = infos
 6401            log.debug(f"New header: {len(header.infos)} fields")
 6402            log.debug(f"Splice tmp output: {output_vcf[0]}")
 6403            self.update_from_vcf(output_vcf[0])
 6404
 6405        # Remove folder
 6406        remove_if_exists(output_folder)
 6407
 6408    ###
 6409    # Prioritization
 6410    ###
 6411
 6412    def get_config_default(self, name: str) -> dict:
 6413        """
 6414        The function `get_config_default` returns a dictionary containing default configurations for
 6415        various calculations and prioritizations.
 6416
 6417        :param name: The `get_config_default` function returns a dictionary containing default
 6418        configurations for different calculations and prioritizations. The `name` parameter is used to
 6419        specify which specific configuration to retrieve from the dictionary
 6420        :type name: str
 6421        :return: The function `get_config_default` returns a dictionary containing default configuration
 6422        settings for different calculations and prioritizations. The specific configuration settings are
 6423        retrieved based on the input `name` parameter provided to the function. If the `name` parameter
 6424        matches a key in the `config_default` dictionary, the corresponding configuration settings are
 6425        returned. If there is no match, an empty dictionary is returned.
 6426        """
 6427
 6428        config_default = {
 6429            "calculations": {
 6430                "variant_chr_pos_alt_ref": {
 6431                    "type": "sql",
 6432                    "name": "variant_chr_pos_alt_ref",
 6433                    "description": "Create a variant ID with chromosome, position, alt and ref",
 6434                    "available": False,
 6435                    "output_column_name": "variant_chr_pos_alt_ref",
 6436                    "output_column_type": "String",
 6437                    "output_column_description": "variant ID with chromosome, position, alt and ref",
 6438                    "operation_query": """ concat("#CHROM", '_', "POS", '_', "REF", '_', "ALT") """,
 6439                    "operation_info": True,
 6440                },
 6441                "VARTYPE": {
 6442                    "type": "sql",
 6443                    "name": "VARTYPE",
 6444                    "description": "Variant type (e.g. SNV, INDEL, MNV, BND...)",
 6445                    "available": True,
 6446                    "output_column_name": "VARTYPE",
 6447                    "output_column_type": "String",
 6448                    "output_column_description": "Variant type: SNV if X>Y, MOSAIC if X>Y,Z or X,Y>Z, INDEL if XY>Z or X>YZ",
 6449                    "operation_query": """
 6450                            CASE
 6451                                WHEN "SVTYPE" NOT NULL THEN "SVTYPE"
 6452                                WHEN LENGTH(REF) = 1 AND LENGTH(ALT) = 1 THEN 'SNV'
 6453                                WHEN REF LIKE '%,%' OR ALT LIKE '%,%' THEN 'MOSAIC'
 6454                                WHEN LENGTH(REF) == LENGTH(ALT) AND LENGTH(REF) > 1 THEN 'MNV'
 6455                                WHEN LENGTH(REF) <> LENGTH(ALT) THEN 'INDEL'
 6456                                ELSE 'UNDEFINED'
 6457                            END
 6458                            """,
 6459                    "info_fields": ["SVTYPE"],
 6460                    "operation_info": True,
 6461                },
 6462                "snpeff_hgvs": {
 6463                    "type": "python",
 6464                    "name": "snpeff_hgvs",
 6465                    "description": "HGVS nomenclatures from snpEff annotation",
 6466                    "available": True,
 6467                    "function_name": "calculation_extract_snpeff_hgvs",
 6468                    "function_params": ["snpeff_hgvs", "ANN"],
 6469                },
 6470                "snpeff_ann_explode": {
 6471                    "type": "python",
 6472                    "name": "snpeff_ann_explode",
 6473                    "description": "Explode snpEff annotations with uniquify values",
 6474                    "available": True,
 6475                    "function_name": "calculation_snpeff_ann_explode",
 6476                    "function_params": [False, "fields", "snpeff_", "ANN"],
 6477                },
 6478                "snpeff_ann_explode_uniquify": {
 6479                    "type": "python",
 6480                    "name": "snpeff_ann_explode_uniquify",
 6481                    "description": "Explode snpEff annotations",
 6482                    "available": True,
 6483                    "function_name": "calculation_snpeff_ann_explode",
 6484                    "function_params": [True, "fields", "snpeff_uniquify_", "ANN"],
 6485                },
 6486                "snpeff_ann_explode_json": {
 6487                    "type": "python",
 6488                    "name": "snpeff_ann_explode_json",
 6489                    "description": "Explode snpEff annotations in JSON format",
 6490                    "available": True,
 6491                    "function_name": "calculation_snpeff_ann_explode",
 6492                    "function_params": [False, "JSON", "snpeff_json", "ANN"],
 6493                },
 6494                "NOMEN": {
 6495                    "type": "python",
 6496                    "name": "NOMEN",
 6497                    "description": "NOMEN information (e.g. NOMEN, CNOMEN, PNOMEN...) from HGVS nomenclature field",
 6498                    "available": True,
 6499                    "function_name": "calculation_extract_nomen",
 6500                    "function_params": [],
 6501                },
 6502                "FINDBYPIPELINE": {
 6503                    "type": "python",
 6504                    "name": "FINDBYPIPELINE",
 6505                    "description": "Number of pipeline that identify the variant (for multi pipeline VCF)",
 6506                    "available": True,
 6507                    "function_name": "calculation_find_by_pipeline",
 6508                    "function_params": ["findbypipeline"],
 6509                },
 6510                "FINDBYSAMPLE": {
 6511                    "type": "python",
 6512                    "name": "FINDBYSAMPLE",
 6513                    "description": "Number of sample that have a genotype for the variant (for multi sample VCF)",
 6514                    "available": True,
 6515                    "function_name": "calculation_find_by_pipeline",
 6516                    "function_params": ["findbysample"],
 6517                },
 6518                "GENOTYPECONCORDANCE": {
 6519                    "type": "python",
 6520                    "name": "GENOTYPECONCORDANCE",
 6521                    "description": "Concordance of genotype for multi caller VCF",
 6522                    "available": True,
 6523                    "function_name": "calculation_genotype_concordance",
 6524                    "function_params": [],
 6525                },
 6526                "BARCODE": {
 6527                    "type": "python",
 6528                    "name": "BARCODE",
 6529                    "description": "BARCODE as VaRank tool",
 6530                    "available": True,
 6531                    "function_name": "calculation_barcode",
 6532                    "function_params": [],
 6533                },
 6534                "BARCODEFAMILY": {
 6535                    "type": "python",
 6536                    "name": "BARCODEFAMILY",
 6537                    "description": "BARCODEFAMILY as VaRank tool",
 6538                    "available": True,
 6539                    "function_name": "calculation_barcode_family",
 6540                    "function_params": ["BCF"],
 6541                },
 6542                "TRIO": {
 6543                    "type": "python",
 6544                    "name": "TRIO",
 6545                    "description": "Inheritance for a trio family",
 6546                    "available": True,
 6547                    "function_name": "calculation_trio",
 6548                    "function_params": [],
 6549                },
 6550                "VAF": {
 6551                    "type": "python",
 6552                    "name": "VAF",
 6553                    "description": "Variant Allele Frequency (VAF) harmonization",
 6554                    "available": True,
 6555                    "function_name": "calculation_vaf_normalization",
 6556                    "function_params": [],
 6557                },
 6558                "VAF_stats": {
 6559                    "type": "python",
 6560                    "name": "VAF_stats",
 6561                    "description": "Variant Allele Frequency (VAF) statistics",
 6562                    "available": True,
 6563                    "function_name": "calculation_genotype_stats",
 6564                    "function_params": ["VAF"],
 6565                },
 6566                "DP_stats": {
 6567                    "type": "python",
 6568                    "name": "DP_stats",
 6569                    "description": "Depth (DP) statistics",
 6570                    "available": True,
 6571                    "function_name": "calculation_genotype_stats",
 6572                    "function_params": ["DP"],
 6573                },
 6574                "variant_id": {
 6575                    "type": "python",
 6576                    "name": "variant_id",
 6577                    "description": "Variant ID generated from variant position and type",
 6578                    "available": True,
 6579                    "function_name": "calculation_variant_id",
 6580                    "function_params": [],
 6581                },
 6582                "transcripts_json": {
 6583                    "type": "python",
 6584                    "name": "transcripts_json",
 6585                    "description": "Add transcripts annotations in JSON format (field 'transcripts_json')",
 6586                    "available": True,
 6587                    "function_name": "calculation_transcripts_annotation",
 6588                    "function_params": ["transcripts_json", None],
 6589                },
 6590                "transcripts_ann": {
 6591                    "type": "python",
 6592                    "name": "transcripts_ann",
 6593                    "description": "Add transcripts annotations in structured format (field 'transcripts_ann')",
 6594                    "available": True,
 6595                    "function_name": "calculation_transcripts_annotation",
 6596                    "function_params": [None, "transcripts_ann"],
 6597                },
 6598                "transcripts_annotations": {
 6599                    "type": "python",
 6600                    "name": "transcripts_annotations",
 6601                    "description": "Add transcripts annotations in JSON and/or structured format (see param JSON file)",
 6602                    "available": True,
 6603                    "function_name": "calculation_transcripts_annotation",
 6604                    "function_params": [None, None],
 6605                },
 6606                "transcripts_prioritization": {
 6607                    "type": "python",
 6608                    "name": "transcripts_prioritization",
 6609                    "description": "Prioritize transcripts with a prioritization profile (using param.json)",
 6610                    "available": True,
 6611                    "function_name": "calculation_transcripts_prioritization",
 6612                    "function_params": [],
 6613                },
 6614            },
 6615            "prioritizations": {
 6616                "default": {
 6617                    "filter": [
 6618                        {
 6619                            "type": "notequals",
 6620                            "value": "!PASS|\\.",
 6621                            "score": 0,
 6622                            "flag": "FILTERED",
 6623                            "comment": ["Bad variant quality"],
 6624                        },
 6625                        {
 6626                            "type": "equals",
 6627                            "value": "REJECT",
 6628                            "score": -20,
 6629                            "flag": "PASS",
 6630                            "comment": ["Bad variant quality"],
 6631                        },
 6632                    ],
 6633                    "DP": [
 6634                        {
 6635                            "type": "gte",
 6636                            "value": "50",
 6637                            "score": 5,
 6638                            "flag": "PASS",
 6639                            "comment": ["DP higher than 50"],
 6640                        }
 6641                    ],
 6642                    "ANN": [
 6643                        {
 6644                            "type": "contains",
 6645                            "value": "HIGH",
 6646                            "score": 5,
 6647                            "flag": "PASS",
 6648                            "comment": [
 6649                                "The variant is assumed to have high (disruptive) impact in the protein, probably causing protein truncation, loss of function or triggering nonsense mediated decay"
 6650                            ],
 6651                        },
 6652                        {
 6653                            "type": "contains",
 6654                            "value": "MODERATE",
 6655                            "score": 3,
 6656                            "flag": "PASS",
 6657                            "comment": [
 6658                                "A non-disruptive variant that might change protein effectiveness"
 6659                            ],
 6660                        },
 6661                        {
 6662                            "type": "contains",
 6663                            "value": "LOW",
 6664                            "score": 0,
 6665                            "flag": "FILTERED",
 6666                            "comment": [
 6667                                "Assumed to be mostly harmless or unlikely to change protein behavior"
 6668                            ],
 6669                        },
 6670                        {
 6671                            "type": "contains",
 6672                            "value": "MODIFIER",
 6673                            "score": 0,
 6674                            "flag": "FILTERED",
 6675                            "comment": [
 6676                                "Usually non-coding variants or variants affecting non-coding genes, where predictions are difficult or there is no evidence of impact"
 6677                            ],
 6678                        },
 6679                    ],
 6680                }
 6681            },
 6682        }
 6683
 6684        return config_default.get(name, None)
 6685
 6686    def get_config_json(
 6687        self, name: str, config_dict: dict = {}, config_file: str = None
 6688    ) -> dict:
 6689        """
 6690        The function `get_config_json` retrieves a configuration JSON object with prioritizations from
 6691        default values, a dictionary, and a file.
 6692
 6693        :param name: The `name` parameter in the `get_config_json` function is a string that represents
 6694        the name of the configuration. It is used to identify and retrieve the configuration settings
 6695        for a specific component or module
 6696        :type name: str
 6697        :param config_dict: The `config_dict` parameter in the `get_config_json` function is a
 6698        dictionary that allows you to provide additional configuration settings or overrides. When you
 6699        call the `get_config_json` function, you can pass a dictionary containing key-value pairs where
 6700        the key is the configuration setting you want to override or
 6701        :type config_dict: dict
 6702        :param config_file: The `config_file` parameter in the `get_config_json` function is used to
 6703        specify the path to a configuration file that contains additional settings. If provided, the
 6704        function will read the contents of this file and update the configuration dictionary with the
 6705        values found in the file, overriding any existing values with the
 6706        :type config_file: str
 6707        :return: The function `get_config_json` returns a dictionary containing the configuration
 6708        settings.
 6709        """
 6710
 6711        # Create with default prioritizations
 6712        config_default = self.get_config_default(name=name)
 6713        configuration = config_default
 6714        # log.debug(f"configuration={configuration}")
 6715
 6716        # Replace prioritizations from dict
 6717        for config in config_dict:
 6718            configuration[config] = config_dict[config]
 6719
 6720        # Replace prioritizations from file
 6721        config_file = full_path(config_file)
 6722        if config_file:
 6723            if os.path.exists(config_file):
 6724                with open(config_file) as config_file_content:
 6725                    config_file_dict = json.load(config_file_content)
 6726                for config in config_file_dict:
 6727                    configuration[config] = config_file_dict[config]
 6728            else:
 6729                msg_error = f"Config '{name}' file '{config_file}' does NOT exist"
 6730                log.error(msg_error)
 6731                raise ValueError(msg_error)
 6732
 6733        return configuration
 6734
 6735    def prioritization(
 6736        self, table: str = None, pz_prefix: str = None, pz_param: dict = None
 6737    ) -> bool:
 6738        """
 6739        The `prioritization` function in Python processes VCF files, adds new INFO fields, and
 6740        prioritizes variants based on configured profiles and criteria.
 6741
 6742        :param table: The `table` parameter in the `prioritization` function is used to specify the name
 6743        of the table (presumably a VCF file) on which the prioritization operation will be performed. If
 6744        a table name is provided, the method will prioritize the variants in that specific table
 6745        :type table: str
 6746        :param pz_prefix: The `pz_prefix` parameter is used to specify a prefix that will be added to
 6747        certain INFO fields in a VCF file during the prioritization process. If this parameter is not
 6748        provided, the code will use a default prefix value of "PZ"
 6749        :type pz_prefix: str
 6750        :param pz_param: The `pz_param` parameter in the `prioritization` method is used to pass
 6751        additional parameters specific to the prioritization process. These parameters can include
 6752        settings related to prioritization profiles, fields, scoring modes, flags, comments, and other
 6753        configurations needed for the prioritization of variants in a V
 6754        :type pz_param: dict
 6755        :return: A boolean value (True) is being returned from the `prioritization` function.
 6756        """
 6757
 6758        # Config
 6759        config = self.get_config()
 6760
 6761        # Param
 6762        param = self.get_param()
 6763
 6764        # Prioritization param
 6765        if pz_param is not None:
 6766            prioritization_param = pz_param
 6767        else:
 6768            prioritization_param = param.get("prioritization", {})
 6769
 6770        # Configuration profiles
 6771        prioritization_config_file = prioritization_param.get(
 6772            "prioritization_config", None
 6773        )
 6774        prioritization_config_file = full_path(prioritization_config_file)
 6775        prioritizations_config = self.get_config_json(
 6776            name="prioritizations", config_file=prioritization_config_file
 6777        )
 6778
 6779        # Prioritization prefix
 6780        pz_prefix_default = "PZ"
 6781        if pz_prefix is None:
 6782            pz_prefix = prioritization_param.get("pzprefix", pz_prefix_default)
 6783
 6784        # Prioritization options
 6785        profiles = prioritization_param.get("profiles", [])
 6786        if isinstance(profiles, str):
 6787            profiles = profiles.split(",")
 6788        pzfields = prioritization_param.get(
 6789            "pzfields", [f"{pz_prefix}Flag", f"{pz_prefix}Score"]
 6790        )
 6791        if isinstance(pzfields, str):
 6792            pzfields = pzfields.split(",")
 6793        default_profile = prioritization_param.get("default_profile", None)
 6794        pzfields_sep = prioritization_param.get("pzfields_sep", "_")
 6795        prioritization_score_mode = prioritization_param.get(
 6796            "prioritization_score_mode", "HOWARD"
 6797        )
 6798
 6799        # Quick Prioritizations
 6800        prioritizations = param.get("prioritizations", None)
 6801        if prioritizations:
 6802            log.info("Quick Prioritization:")
 6803            for profile in prioritizations.split(","):
 6804                if profile not in profiles:
 6805                    profiles.append(profile)
 6806                    log.info(f"   {profile}")
 6807
 6808        # If profile "ALL" provided, all profiles in the config profiles
 6809        if "ALL" in profiles:
 6810            profiles = list(prioritizations_config.keys())
 6811
 6812        for profile in profiles:
 6813            if prioritizations_config.get(profile, None):
 6814                log.debug(f"Profile '{profile}' configured")
 6815            else:
 6816                msg_error = f"Profile '{profile}' NOT configured"
 6817                log.error(msg_error)
 6818                raise ValueError(msg_error)
 6819
 6820        if profiles:
 6821            log.info(f"Prioritization... ")
 6822        else:
 6823            log.debug(f"No profile defined")
 6824            return False
 6825
 6826        if not default_profile and len(profiles):
 6827            default_profile = profiles[0]
 6828
 6829        log.debug("Profiles availables: " + str(list(prioritizations_config.keys())))
 6830        log.debug("Profiles to check: " + str(list(profiles)))
 6831
 6832        # Variables
 6833        if table is not None:
 6834            table_variants = table
 6835        else:
 6836            table_variants = self.get_table_variants(clause="update")
 6837        log.debug(f"Table to prioritize: {table_variants}")
 6838
 6839        # Added columns
 6840        added_columns = []
 6841
 6842        # Create list of PZfields
 6843        # List of PZFields
 6844        list_of_pzfields_original = pzfields + [
 6845            pzfield + pzfields_sep + profile
 6846            for pzfield in pzfields
 6847            for profile in profiles
 6848        ]
 6849        list_of_pzfields = []
 6850        log.debug(f"{list_of_pzfields_original}")
 6851
 6852        # Remove existing PZfields to use if exists
 6853        for pzfield in list_of_pzfields_original:
 6854            if self.get_header().infos.get(pzfield, None) is None:
 6855                list_of_pzfields.append(pzfield)
 6856                log.debug(f"VCF Input - Header - PZfield '{pzfield}' not in VCF")
 6857            else:
 6858                log.debug(f"VCF Input - Header - PZfield '{pzfield}' already in VCF")
 6859
 6860        if list_of_pzfields:
 6861
 6862            # Explode Infos prefix
 6863            explode_infos_prefix = self.get_explode_infos_prefix()
 6864
 6865            # PZfields tags description
 6866            PZfields_INFOS = {
 6867                f"{pz_prefix}Tags": {
 6868                    "ID": f"{pz_prefix}Tags",
 6869                    "Number": ".",
 6870                    "Type": "String",
 6871                    "Description": "Variant tags based on annotation criteria",
 6872                },
 6873                f"{pz_prefix}Score": {
 6874                    "ID": f"{pz_prefix}Score",
 6875                    "Number": 1,
 6876                    "Type": "Integer",
 6877                    "Description": "Variant score based on annotation criteria",
 6878                },
 6879                f"{pz_prefix}Flag": {
 6880                    "ID": f"{pz_prefix}Flag",
 6881                    "Number": 1,
 6882                    "Type": "String",
 6883                    "Description": "Variant flag based on annotation criteria",
 6884                },
 6885                f"{pz_prefix}Comment": {
 6886                    "ID": f"{pz_prefix}Comment",
 6887                    "Number": ".",
 6888                    "Type": "String",
 6889                    "Description": "Variant comment based on annotation criteria",
 6890                },
 6891                f"{pz_prefix}Infos": {
 6892                    "ID": f"{pz_prefix}Infos",
 6893                    "Number": ".",
 6894                    "Type": "String",
 6895                    "Description": "Variant infos based on annotation criteria",
 6896                },
 6897            }
 6898
 6899            # Create INFO fields if not exist
 6900            for field in PZfields_INFOS:
 6901                field_ID = PZfields_INFOS[field]["ID"]
 6902                field_description = PZfields_INFOS[field]["Description"]
 6903                if field_ID not in self.get_header().infos and field_ID in pzfields:
 6904                    field_description = (
 6905                        PZfields_INFOS[field]["Description"]
 6906                        + f", profile {default_profile}"
 6907                    )
 6908                    self.get_header().infos[field_ID] = vcf.parser._Info(
 6909                        field_ID,
 6910                        PZfields_INFOS[field]["Number"],
 6911                        PZfields_INFOS[field]["Type"],
 6912                        field_description,
 6913                        "unknown",
 6914                        "unknown",
 6915                        code_type_map[PZfields_INFOS[field]["Type"]],
 6916                    )
 6917
 6918            # Create INFO fields if not exist for each profile
 6919            for profile in prioritizations_config:
 6920                if profile in profiles or profiles == []:
 6921                    for field in PZfields_INFOS:
 6922                        field_ID = PZfields_INFOS[field]["ID"] + pzfields_sep + profile
 6923                        field_description = (
 6924                            PZfields_INFOS[field]["Description"]
 6925                            + f", profile {profile}"
 6926                        )
 6927                        if (
 6928                            field_ID not in self.get_header().infos
 6929                            and field in pzfields
 6930                        ):
 6931                            self.get_header().infos[field_ID] = vcf.parser._Info(
 6932                                field_ID,
 6933                                PZfields_INFOS[field]["Number"],
 6934                                PZfields_INFOS[field]["Type"],
 6935                                field_description,
 6936                                "unknown",
 6937                                "unknown",
 6938                                code_type_map[PZfields_INFOS[field]["Type"]],
 6939                            )
 6940
 6941            # Header
 6942            for pzfield in list_of_pzfields:
 6943                if re.match(f"{pz_prefix}Score.*", pzfield):
 6944                    added_column = self.add_column(
 6945                        table_name=table_variants,
 6946                        column_name=pzfield,
 6947                        column_type="INTEGER",
 6948                        default_value="0",
 6949                    )
 6950                elif re.match(f"{pz_prefix}Flag.*", pzfield):
 6951                    added_column = self.add_column(
 6952                        table_name=table_variants,
 6953                        column_name=pzfield,
 6954                        column_type="BOOLEAN",
 6955                        default_value="1",
 6956                    )
 6957                else:
 6958                    added_column = self.add_column(
 6959                        table_name=table_variants,
 6960                        column_name=pzfield,
 6961                        column_type="STRING",
 6962                        default_value="''",
 6963                    )
 6964                added_columns.append(added_column)
 6965
 6966            # Profiles
 6967            if profiles:
 6968
 6969                # foreach profile in configuration file
 6970                for profile in prioritizations_config:
 6971
 6972                    # If profile is asked in param, or ALL are asked (empty profile [])
 6973                    if profile in profiles or profiles == []:
 6974                        log.info(f"Profile '{profile}'")
 6975
 6976                        sql_set_info_option = ""
 6977
 6978                        sql_set_info = []
 6979
 6980                        # PZ fields set
 6981
 6982                        # PZScore
 6983                        if (
 6984                            f"{pz_prefix}Score{pzfields_sep}{profile}"
 6985                            in list_of_pzfields
 6986                        ):
 6987                            sql_set_info.append(
 6988                                f"""
 6989                                    concat(
 6990                                        '{pz_prefix}Score{pzfields_sep}{profile}=',
 6991                                        {pz_prefix}Score{pzfields_sep}{profile}
 6992                                    ) 
 6993                                """
 6994                            )
 6995                            if (
 6996                                profile == default_profile
 6997                                and f"{pz_prefix}Score" in list_of_pzfields
 6998                            ):
 6999                                sql_set_info.append(
 7000                                    f"""
 7001                                        concat(
 7002                                            '{pz_prefix}Score=',
 7003                                            {pz_prefix}Score{pzfields_sep}{profile}
 7004                                        )
 7005                                    """
 7006                                )
 7007
 7008                        # PZFlag
 7009                        if (
 7010                            f"{pz_prefix}Flag{pzfields_sep}{profile}"
 7011                            in list_of_pzfields
 7012                        ):
 7013                            sql_set_info.append(
 7014                                f"""
 7015                                    concat(
 7016                                        '{pz_prefix}Flag{pzfields_sep}{profile}=',
 7017                                        CASE 
 7018                                            WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1
 7019                                            THEN 'PASS'
 7020                                            WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0
 7021                                            THEN 'FILTERED'
 7022                                        END
 7023                                    ) 
 7024                                """
 7025                            )
 7026                            if (
 7027                                profile == default_profile
 7028                                and f"{pz_prefix}Flag" in list_of_pzfields
 7029                            ):
 7030                                sql_set_info.append(
 7031                                    f"""
 7032                                        concat(
 7033                                            '{pz_prefix}Flag=',
 7034                                            CASE 
 7035                                                WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1
 7036                                                THEN 'PASS'
 7037                                                WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0
 7038                                                THEN 'FILTERED'
 7039                                            END
 7040                                        )
 7041                                    """
 7042                                )
 7043
 7044                        # PZComment
 7045                        if (
 7046                            f"{pz_prefix}Comment{pzfields_sep}{profile}"
 7047                            in list_of_pzfields
 7048                        ):
 7049                            sql_set_info.append(
 7050                                f"""
 7051                                    CASE
 7052                                        WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('')
 7053                                        THEN concat('{pz_prefix}Comment{pzfields_sep}{profile}=', {pz_prefix}Comment{pzfields_sep}{profile})
 7054                                        ELSE ''
 7055                                    END
 7056                                """
 7057                            )
 7058                            if (
 7059                                profile == default_profile
 7060                                and f"{pz_prefix}Comment" in list_of_pzfields
 7061                            ):
 7062                                sql_set_info.append(
 7063                                    f"""
 7064                                        CASE
 7065                                            WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('')
 7066                                            THEN concat('{pz_prefix}Comment=', {pz_prefix}Comment{pzfields_sep}{profile})
 7067                                            ELSE ''
 7068                                        END
 7069                                    """
 7070                                )
 7071
 7072                        # PZInfos
 7073                        if (
 7074                            f"{pz_prefix}Infos{pzfields_sep}{profile}"
 7075                            in list_of_pzfields
 7076                        ):
 7077                            sql_set_info.append(
 7078                                f"""
 7079                                    CASE
 7080                                        WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('')
 7081                                        THEN concat('{pz_prefix}Infos{pzfields_sep}{profile}=', {pz_prefix}Infos{pzfields_sep}{profile})
 7082                                        ELSE ''
 7083                                    END
 7084                                """
 7085                            )
 7086                            if (
 7087                                profile == default_profile
 7088                                and f"{pz_prefix}Infos" in list_of_pzfields
 7089                            ):
 7090                                sql_set_info.append(
 7091                                    f"""
 7092                                        CASE
 7093                                            WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('')
 7094                                            THEN concat('{pz_prefix}Infos=', {pz_prefix}Infos{pzfields_sep}{profile})
 7095                                            ELSE ''
 7096                                        END
 7097                                    """
 7098                                )
 7099
 7100                        # Merge PZfields
 7101                        sql_set_info_option = ""
 7102                        sql_set_sep = ""
 7103                        for sql_set in sql_set_info:
 7104                            if sql_set_sep:
 7105                                sql_set_info_option += f"""
 7106                                    , concat('{sql_set_sep}', {sql_set})
 7107                                """
 7108                            else:
 7109                                sql_set_info_option += f"""
 7110                                    , {sql_set}
 7111                                """
 7112                            sql_set_sep = ";"
 7113
 7114                        sql_queries = []
 7115                        for annotation in prioritizations_config[profile]:
 7116
 7117                            # Explode specific annotation
 7118                            log.debug(f"Explode annotation '{annotation}'")
 7119                            added_columns += self.explode_infos(
 7120                                prefix=explode_infos_prefix,
 7121                                fields=[annotation],
 7122                                table=table_variants,
 7123                            )
 7124                            extra_infos = self.get_extra_infos(table=table_variants)
 7125
 7126                            # Check if annotation field is present
 7127                            if not f"{explode_infos_prefix}{annotation}" in extra_infos:
 7128                                log.debug(f"Annotation '{annotation}' not in data")
 7129                                continue
 7130                            else:
 7131                                log.debug(f"Annotation '{annotation}' in data")
 7132
 7133                            # For each criterions
 7134                            for criterion in prioritizations_config[profile][
 7135                                annotation
 7136                            ]:
 7137                                criterion_type = criterion["type"]
 7138                                criterion_value = criterion["value"]
 7139                                criterion_score = criterion.get("score", 0)
 7140                                criterion_flag = criterion.get("flag", "PASS")
 7141                                criterion_flag_bool = criterion_flag == "PASS"
 7142                                criterion_comment = (
 7143                                    ", ".join(criterion.get("comment", []))
 7144                                    .replace("'", "''")
 7145                                    .replace(";", ",")
 7146                                    .replace("\t", " ")
 7147                                )
 7148                                criterion_infos = (
 7149                                    str(criterion)
 7150                                    .replace("'", "''")
 7151                                    .replace(";", ",")
 7152                                    .replace("\t", " ")
 7153                                )
 7154
 7155                                sql_set = []
 7156                                sql_set_info = []
 7157
 7158                                # PZ fields set
 7159                                if (
 7160                                    f"{pz_prefix}Score{pzfields_sep}{profile}"
 7161                                    in list_of_pzfields
 7162                                ):
 7163                                    if prioritization_score_mode == "HOWARD":
 7164                                        sql_set.append(
 7165                                            f"{pz_prefix}Score{pzfields_sep}{profile} = {pz_prefix}Score{pzfields_sep}{profile} + {criterion_score}"
 7166                                        )
 7167                                    elif prioritization_score_mode == "VaRank":
 7168                                        sql_set.append(
 7169                                            f"{pz_prefix}Score{pzfields_sep}{profile} = CASE WHEN {criterion_score}>{pz_prefix}Score{pzfields_sep}{profile} THEN {criterion_score} END"
 7170                                        )
 7171                                    else:
 7172                                        sql_set.append(
 7173                                            f"{pz_prefix}Score{pzfields_sep}{profile} = {pz_prefix}Score{pzfields_sep}{profile} + {criterion_score}"
 7174                                        )
 7175                                if (
 7176                                    f"{pz_prefix}Flag{pzfields_sep}{profile}"
 7177                                    in list_of_pzfields
 7178                                ):
 7179                                    sql_set.append(
 7180                                        f"{pz_prefix}Flag{pzfields_sep}{profile} = {pz_prefix}Flag{pzfields_sep}{profile} AND {criterion_flag_bool}"
 7181                                    )
 7182                                if (
 7183                                    f"{pz_prefix}Comment{pzfields_sep}{profile}"
 7184                                    in list_of_pzfields
 7185                                ):
 7186                                    sql_set.append(
 7187                                        f"""
 7188                                            {pz_prefix}Comment{pzfields_sep}{profile} = 
 7189                                                concat(
 7190                                                    {pz_prefix}Comment{pzfields_sep}{profile},
 7191                                                    CASE 
 7192                                                        WHEN {pz_prefix}Comment{pzfields_sep}{profile}!=''
 7193                                                        THEN ', '
 7194                                                        ELSE ''
 7195                                                    END,
 7196                                                    '{criterion_comment}'
 7197                                                )
 7198                                        """
 7199                                    )
 7200                                if (
 7201                                    f"{pz_prefix}Infos{pzfields_sep}{profile}"
 7202                                    in list_of_pzfields
 7203                                ):
 7204                                    sql_set.append(
 7205                                        f"""
 7206                                            {pz_prefix}Infos{pzfields_sep}{profile} = 
 7207                                                concat(
 7208                                                    {pz_prefix}Infos{pzfields_sep}{profile},
 7209                                                    '{criterion_infos}'
 7210                                                )
 7211                                        """
 7212                                    )
 7213                                sql_set_option = ",".join(sql_set)
 7214
 7215                                # Criterion and comparison
 7216                                if sql_set_option:
 7217                                    try:
 7218                                        float(criterion_value)
 7219                                        sql_update = f"""
 7220                                            UPDATE {table_variants}
 7221                                            SET {sql_set_option}
 7222                                            WHERE CAST("{explode_infos_prefix}{annotation}" AS VARCHAR) NOT IN ('','.')
 7223                                            AND CAST("{explode_infos_prefix}{annotation}" AS FLOAT){comparison_map[criterion_type]}{criterion_value}
 7224                                            """
 7225                                    except:
 7226                                        contains_option = ""
 7227                                        if criterion_type == "contains":
 7228                                            contains_option = ".*"
 7229                                        sql_update = f"""
 7230                                            UPDATE {table_variants}
 7231                                            SET {sql_set_option}
 7232                                            WHERE "{explode_infos_prefix}{annotation}" SIMILAR TO '{contains_option}{criterion_value}{contains_option}'
 7233                                            """
 7234                                    sql_queries.append(sql_update)
 7235                                else:
 7236                                    log.warning(
 7237                                        f"NO SQL SET option for '{annotation}' - '{criterion}'"
 7238                                    )
 7239
 7240                        # PZTags
 7241                        if (
 7242                            f"{pz_prefix}Tags{pzfields_sep}{profile}"
 7243                            in list_of_pzfields
 7244                        ):
 7245
 7246                            # Create PZFalgs value
 7247                            pztags_value = ""
 7248                            pztags_sep_default = "|"
 7249                            pztags_sep = ""
 7250                            for pzfield in pzfields:
 7251                                if pzfield not in [f"{pz_prefix}Tags"]:
 7252                                    if (
 7253                                        f"{pzfield}{pzfields_sep}{profile}"
 7254                                        in list_of_pzfields
 7255                                    ):
 7256                                        if pzfield in [f"{pz_prefix}Flag"]:
 7257                                            pztags_value += f"""{pztags_sep}{pzfield}#', 
 7258                                                CASE WHEN {pz_prefix}Flag{pzfields_sep}{profile}
 7259                                                    THEN 'PASS'
 7260                                                    ELSE 'FILTERED'
 7261                                                END, '"""
 7262                                        else:
 7263                                            pztags_value += f"{pztags_sep}{pzfield}#', {pzfield}{pzfields_sep}{profile}, '"
 7264                                        pztags_sep = pztags_sep_default
 7265
 7266                            # Add Query update for PZFlags
 7267                            sql_update_pztags = f"""
 7268                                UPDATE {table_variants}
 7269                                SET INFO = concat(
 7270                                        INFO,
 7271                                        CASE WHEN INFO NOT in ('','.')
 7272                                                THEN ';'
 7273                                                ELSE ''
 7274                                        END,
 7275                                        '{pz_prefix}Tags{pzfields_sep}{profile}={pztags_value}'
 7276                                    )
 7277                                """
 7278                            sql_queries.append(sql_update_pztags)
 7279
 7280                            # Add Query update for PZFlags for default
 7281                            if profile == default_profile:
 7282                                sql_update_pztags_default = f"""
 7283                                UPDATE {table_variants}
 7284                                SET INFO = concat(
 7285                                        INFO,
 7286                                        ';',
 7287                                        '{pz_prefix}Tags={pztags_value}'
 7288                                    )
 7289                                """
 7290                                sql_queries.append(sql_update_pztags_default)
 7291
 7292                        log.info(f"""Profile '{profile}' - Prioritization... """)
 7293
 7294                        if sql_queries:
 7295
 7296                            for sql_query in sql_queries:
 7297                                log.debug(
 7298                                    f"""Profile '{profile}' - Prioritization query: {sql_query}... """
 7299                                )
 7300                                self.conn.execute(sql_query)
 7301
 7302                        log.info(f"""Profile '{profile}' - Update... """)
 7303                        sql_query_update = f"""
 7304                            UPDATE {table_variants}
 7305                            SET INFO =  
 7306                                concat(
 7307                                    CASE
 7308                                        WHEN INFO NOT IN ('','.')
 7309                                        THEN concat(INFO, ';')
 7310                                        ELSE ''
 7311                                    END
 7312                                    {sql_set_info_option}
 7313                                )
 7314                        """
 7315                        self.conn.execute(sql_query_update)
 7316
 7317        else:
 7318
 7319            log.warning(f"No profiles in parameters")
 7320
 7321        # Remove added columns
 7322        for added_column in added_columns:
 7323            self.drop_column(column=added_column)
 7324
 7325        # Explode INFOS fields into table fields
 7326        if self.get_explode_infos():
 7327            self.explode_infos(
 7328                prefix=self.get_explode_infos_prefix(),
 7329                fields=self.get_explode_infos_fields(),
 7330                force=True,
 7331            )
 7332
 7333        return True
 7334
 7335    ###
 7336    # HGVS
 7337    ###
 7338
    def annotation_hgvs(self, threads: int = None) -> None:
        """
        The `annotation_hgvs` function performs HGVS annotation on a set of variants using genomic
        coordinates and alleles.

        Side effects: appends a 'hgvs=<names>' entry to the INFO column of the variants
        table and registers a 'hgvs' INFO field into the in-memory VCF header. Only
        variants whose REF and ALT are strictly alphabetic (SNV/InDel) are annotated.

        :param threads: The `threads` parameter is an optional integer that specifies the number of
        threads to use for parallel processing. If no value is provided, it will default to the number
        of threads obtained from the `get_threads()` method
        :type threads: int
        """

        # Function for each partition of the Dask Dataframe
        # NOTE: relies on the closure name `annotation_hgvs_partition` defined below
        def partition_function(partition):
            """
            The function `partition_function` applies the `annotation_hgvs_partition` function to
            each row of a DataFrame called `partition`.

            :param partition: The parameter "partition" is a pandas DataFrame that contains the data
            to be processed
            :return: the result of applying the "annotation_hgvs_partition" function to each row of
            the "partition" dataframe along the axis 1.
            """
            return partition.apply(annotation_hgvs_partition, axis=1)

        def annotation_hgvs_partition(row) -> str:
            """
            The function `annotation_hgvs_partition` takes in a row of data and returns a string
            containing a list of HGVS names associated with the given genomic coordinates and alleles.

            Reads several closure variables from the enclosing method: `polars_conn`,
            `transcripts`, `genome` and the HGVS options (`use_exon`, `use_gene`,
            `use_protein`, `add_protein`, `full_format`, `use_version`, `codon_type`).

            :param row: A dictionary-like object that contains the values for the keys
            CHROM, POS, REF and ALT
            :return: a string that contains the HGVS names associated with the given row of data,
            comma-separated (empty string when no transcript overlaps the position).
            """

            chr = row["CHROM"]
            pos = row["POS"]
            ref = row["REF"]
            alt = row["ALT"]

            # Find list of associated transcripts
            transcripts_list = list(
                polars_conn.execute(
                    f"""
                SELECT transcript
                FROM refseq_df
                WHERE CHROM='{chr}'
                AND POS={pos}
            """
                )["transcript"]
            )

            # Full HGVS annotation in list
            hgvs_full_list = []

            for transcript_name in transcripts_list:

                # Transcript
                transcript = get_transcript(
                    transcripts=transcripts, transcript_name=transcript_name
                )
                # Exon
                if use_exon:
                    exon = transcript.find_exon_number(pos)
                else:
                    exon = None
                # Protein
                # NOTE(review): `refseqlink_df` is only defined when a refSeqLink file was
                # found; if use_protein/add_protein/full_format is set without one, this
                # query would reference an undefined frame — TODO confirm upstream guarantees
                transcript_protein = None
                if use_protein or add_protein or full_format:
                    transcripts_protein = list(
                        polars_conn.execute(
                            f"""
                        SELECT protein
                        FROM refseqlink_df
                        WHERE transcript='{transcript_name}'
                        LIMIT 1
                    """
                        )["protein"]
                    )
                    if len(transcripts_protein):
                        transcript_protein = transcripts_protein[0]

                # HGVS name
                hgvs_name = format_hgvs_name(
                    chr,
                    pos,
                    ref,
                    alt,
                    genome=genome,
                    transcript=transcript,
                    transcript_protein=transcript_protein,
                    exon=exon,
                    use_gene=use_gene,
                    use_protein=use_protein,
                    full_format=full_format,
                    use_version=use_version,
                    codon_type=codon_type,
                )
                hgvs_full_list.append(hgvs_name)
                # Optionally add a second, protein-level HGVS name for the same transcript
                if add_protein and not use_protein and not full_format:
                    hgvs_name = format_hgvs_name(
                        chr,
                        pos,
                        ref,
                        alt,
                        genome=genome,
                        transcript=transcript,
                        transcript_protein=transcript_protein,
                        exon=exon,
                        use_gene=use_gene,
                        use_protein=True,
                        full_format=False,
                        use_version=use_version,
                        codon_type=codon_type,
                    )
                    hgvs_full_list.append(hgvs_name)

            # Create liste of HGVS annotations
            hgvs_full = ",".join(hgvs_full_list)

            return hgvs_full

        # Polars connexion
        # NOTE(review): a second SQLContext is created further below, after the refseq
        # frames exist; this first one looks redundant — TODO confirm before removing
        polars_conn = pl.SQLContext(register_globals=True, eager=True)

        # Config
        config = self.get_config()

        # Databases
        # Genome
        databases_genomes_folders = (
            config.get("folders", {})
            .get("databases", {})
            .get("genomes", DEFAULT_GENOME_FOLDER)
        )
        databases_genome = (
            config.get("folders", {}).get("databases", {}).get("genomes", "")
        )
        # refseq database folder
        databases_refseq_folders = (
            config.get("folders", {})
            .get("databases", {})
            .get("refseq", DEFAULT_REFSEQ_FOLDER)
        )
        # refseq
        databases_refseq = config.get("databases", {}).get("refSeq", None)
        # refSeqLink
        databases_refseqlink = config.get("databases", {}).get("refSeqLink", None)

        # Param
        param = self.get_param()

        # Quick HGVS
        # Parse "hgvs_options" (comma-separated var=val pairs) into param["hgvs"]:
        # a bare option name means True; "true"/"false" strings become booleans
        if "hgvs_options" in param and param.get("hgvs_options", ""):
            log.info(f"Quick HGVS Annotation:")
            if not param.get("hgvs", None):
                param["hgvs"] = {}
            for option in param.get("hgvs_options", "").split(","):
                option_var_val = option.split("=")
                option_var = option_var_val[0]
                if len(option_var_val) > 1:
                    option_val = option_var_val[1]
                else:
                    option_val = "True"
                if option_val.upper() in ["TRUE"]:
                    option_val = True
                elif option_val.upper() in ["FALSE"]:
                    option_val = False
                log.info(f"   {option_var}={option_val}")
                param["hgvs"][option_var] = option_val

        # Check if HGVS annotation enabled; bail out silently otherwise
        if "hgvs" in param:
            log.info(f"HGVS Annotation... ")
            for hgvs_option in param.get("hgvs", {}):
                log.info(f"{hgvs_option}: {param.get('hgvs',{}).get(hgvs_option)}")
        else:
            return

        # HGVS Param
        param_hgvs = param.get("hgvs", {})
        use_exon = param_hgvs.get("use_exon", False)
        use_gene = param_hgvs.get("use_gene", False)
        use_protein = param_hgvs.get("use_protein", False)
        add_protein = param_hgvs.get("add_protein", False)
        full_format = param_hgvs.get("full_format", False)
        use_version = param_hgvs.get("use_version", False)
        codon_type = param_hgvs.get("codon_type", "3")

        # refSseq refSeqLink (param values override the config-level databases)
        databases_refseq = param_hgvs.get("refseq", databases_refseq)
        databases_refseqlink = param_hgvs.get("refseqlink", databases_refseqlink)

        # Assembly
        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))

        # Genome: explicit genome file first, otherwise look it up by assembly
        genome_file = None
        if find_genome(databases_genome):
            genome_file = find_genome(databases_genome)
        else:
            genome_file = find_genome(
                genome_path=databases_genomes_folders, assembly=assembly
            )
        log.debug("Genome: " + str(genome_file))

        # refSseq
        refseq_file = find_file_prefix(
            input_file=databases_refseq,
            prefix="ncbiRefSeq",
            folder=databases_refseq_folders,
            assembly=assembly,
        )
        log.debug("refSeq: " + str(refseq_file))

        # refSeqLink
        refseqlink_file = find_file_prefix(
            input_file=databases_refseqlink,
            prefix="ncbiRefSeqLink",
            folder=databases_refseq_folders,
            assembly=assembly,
        )
        log.debug("refSeqLink: " + str(refseqlink_file))

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Variables
        table_variants = self.get_table_variants(clause="update")

        # Get variants SNV and InDel only (REF/ALT strictly alphabetic)
        query_variants = f"""
            SELECT "#CHROM" AS CHROM, POS, REF, ALT
            FROM {table_variants}
            WHERE REF ~ '^[A-Za-z]+$' AND ALT ~ '^[A-Za-z]+$'
            """
        df_variants = self.get_query_to_df(query_variants)

        # Added columns
        added_columns = []

        # Add hgvs column in variants table
        # NOTE(review): random suffix reduces, but does not rule out, name collisions
        hgvs_column_name = "hgvs_" + str(random.randrange(1000))
        added_column = self.add_column(
            table_variants, hgvs_column_name, "STRING", default_value=None
        )
        added_columns.append(added_column)

        log.debug(f"refSeq loading...")
        # refSeq in duckDB
        refseq_table = get_refseq_table(
            conn=self.conn, refseq_table="refseq", refseq_file=refseq_file
        )
        # Loading all refSeq in Dataframe
        # (only transcripts overlapping at least one variant position)
        refseq_query = f"""
            SELECT df_variants.CHROM, df_variants.POS, {refseq_table}.name AS transcript
            FROM {refseq_table}
            JOIN df_variants ON (
                {refseq_table}.chrom = df_variants.CHROM
                AND {refseq_table}.txStart<=df_variants.POS
                AND {refseq_table}.txEnd>=df_variants.POS
            )
        """
        refseq_df = self.conn.query(refseq_query).pl()

        if refseqlink_file:
            log.debug(f"refSeqLink loading...")
            # refSeqLink in duckDB
            refseqlink_table = get_refseq_table(
                conn=self.conn, refseq_table="refseqlink", refseq_file=refseqlink_file
            )
            # Loading all refSeqLink in Dataframe
            protacc_column = "protAcc_with_ver"
            mrnaacc_column = "mrnaAcc_with_ver"
            refseqlink_query = f"""
                SELECT {refseq_table}.chrom, {protacc_column} AS protein, {mrnaacc_column} AS transcript
                FROM {refseqlink_table} 
                JOIN {refseq_table} ON ({refseq_table}.name = {refseqlink_table}.mrnaAcc_with_ver)
                WHERE protAcc_without_ver IS NOT NULL
            """
            # Polars Dataframe
            refseqlink_df = self.conn.query(f"{refseqlink_query}").pl()

        # Read RefSeq transcripts into a python dict/model.
        log.debug(f"Transcripts loading...")
        with tempfile.TemporaryDirectory() as tmpdir:
            # Export the overlapping transcripts to TSV, then parse with read_transcripts
            # (the '\t' below is a literal tab character in the generated SQL)
            transcripts_query = f"""
                COPY (
                    SELECT {refseq_table}.*
                    FROM {refseq_table}
                    JOIN df_variants ON (
                        {refseq_table}.chrom=df_variants.CHROM
                        AND {refseq_table}.txStart<=df_variants.POS
                        AND {refseq_table}.txEnd>=df_variants.POS
                    )
                )
                TO '{tmpdir}/transcript.tsv' (DELIMITER '\t');
            """
            self.conn.query(transcripts_query)
            with open(f"{tmpdir}/transcript.tsv") as infile:
                transcripts = read_transcripts(infile)

        # Polars connexion
        # presumably re-created here so the freshly built refseq_df/refseqlink_df frames
        # are registered for the SQL run inside annotation_hgvs_partition — TODO confirm
        polars_conn = pl.SQLContext(register_globals=True, eager=True)

        log.debug("Genome loading...")
        # Read genome sequence using pyfaidx.
        genome = Fasta(genome_file)

        log.debug("Start annotation HGVS...")

        # Create
        # a Dask Dataframe from Pandas dataframe with partition as number of threads
        ddf = dd.from_pandas(df_variants, npartitions=threads)

        # Use dask.dataframe.apply() to apply function on each partition
        ddf[hgvs_column_name] = ddf.map_partitions(partition_function)

        # Convert Dask DataFrame to Pandas Dataframe
        df = ddf.compute()

        # Convert Pandas dataframe to parquet (due to error in cast VARCHAR -> NULL ???)
        with tempfile.TemporaryDirectory() as tmpdir:
            df_parquet = os.path.join(tmpdir, "df.parquet")
            df.to_parquet(df_parquet)

            # Update hgvs column (match on CHROM/POS/REF/ALT, skip empty/NULL results)
            update_variant_query = f"""
                UPDATE {table_variants}
                SET "{hgvs_column_name}"=df."{hgvs_column_name}"
                FROM read_parquet('{df_parquet}') as df
                WHERE variants."#CHROM" = df.CHROM
                AND variants.POS = df.POS
                AND variants.REF = df.REF
                AND variants.ALT = df.ALT
                AND df."{hgvs_column_name}" NOT IN ('') AND df."{hgvs_column_name}" NOT NULL
                """
            self.execute_query(update_variant_query)

        # Update INFO column: append 'hgvs=<names>' behind a ';' separator when INFO
        # already holds content
        sql_query_update = f"""
            UPDATE {table_variants}
            SET INFO = 
                concat(
                    CASE 
                        WHEN INFO NOT IN ('','.')
                        THEN concat(INFO, ';')
                        ELSE ''
                    END,
                    'hgvs=',
                    {hgvs_column_name}
                )
            WHERE "{hgvs_column_name}" NOT IN ('') AND "{hgvs_column_name}" NOT NULL
            """
        self.execute_query(sql_query_update)

        # Add header
        HGVS_INFOS = {
            "hgvs": {
                "ID": "hgvs",
                "Number": ".",
                "Type": "String",
                "Description": f"HGVS annotatation with HOWARD",
            }
        }

        # Register the 'hgvs' INFO field into the in-memory VCF header
        for field in HGVS_INFOS:
            field_ID = HGVS_INFOS[field]["ID"]
            field_description = HGVS_INFOS[field]["Description"]
            self.get_header().infos[field_ID] = vcf.parser._Info(
                field_ID,
                HGVS_INFOS[field]["Number"],
                HGVS_INFOS[field]["Type"],
                field_description,
                "unknown",
                "unknown",
                code_type_map[HGVS_INFOS[field]["Type"]],
            )

        # Remove added columns (the temporary hgvs working column)
        for added_column in added_columns:
            self.drop_column(column=added_column)
 7721
 7722    ###
 7723    # Calculation
 7724    ###
 7725
 7726    def get_operations_help(
 7727        self, operations_config_dict: dict = {}, operations_config_file: str = None
 7728    ) -> list:
 7729
 7730        # Init
 7731        operations_help = []
 7732
 7733        # operations
 7734        operations = self.get_config_json(
 7735            name="calculations",
 7736            config_dict=operations_config_dict,
 7737            config_file=operations_config_file,
 7738        )
 7739        for op in operations:
 7740            op_name = operations[op].get("name", op).upper()
 7741            op_description = operations[op].get("description", op_name)
 7742            op_available = operations[op].get("available", False)
 7743            if op_available:
 7744                operations_help.append(f"   {op_name}: {op_description}")
 7745
 7746        # Sort operations
 7747        operations_help.sort()
 7748
 7749        # insert header
 7750        operations_help.insert(0, "Available calculation operations:")
 7751
 7752        # Return
 7753        return operations_help
 7754
 7755    def calculation(
 7756        self,
 7757        operations: dict = {},
 7758        operations_config_dict: dict = {},
 7759        operations_config_file: str = None,
 7760    ) -> None:
 7761        """
 7762        It takes a list of operations, and for each operation, it checks if it's a python or sql
 7763        operation, and then calls the appropriate function
 7764
 7765        param json example:
 7766            "calculation": {
 7767                "NOMEN": {
 7768                    "options": {
 7769                        "hgvs_field": "hgvs"
 7770                    },
 7771                "middle" : null
 7772            }
 7773        """
 7774
 7775        # Param
 7776        param = self.get_param()
 7777
 7778        # operations config
 7779        operations_config = self.get_config_json(
 7780            name="calculations",
 7781            config_dict=operations_config_dict,
 7782            config_file=operations_config_file,
 7783        )
 7784
 7785        # Upper keys
 7786        operations_config = {k.upper(): v for k, v in operations_config.items()}
 7787
 7788        # Calculations
 7789
 7790        # Operations from param
 7791        operations = param.get("calculation", {}).get("calculations", operations)
 7792
 7793        # Quick calculation - add
 7794        if param.get("calculations", None):
 7795            calculations_list = [
 7796                value for value in param.get("calculations", "").split(",")
 7797            ]
 7798            log.info(f"Quick Calculations:")
 7799            for calculation_key in calculations_list:
 7800                log.info(f"   {calculation_key}")
 7801            for calculation_operation in calculations_list:
 7802                if calculation_operation.upper() not in operations:
 7803                    operations[calculation_operation.upper()] = {}
 7804                    add_value_into_dict(
 7805                        dict_tree=param,
 7806                        sections=[
 7807                            "calculation",
 7808                            "calculations",
 7809                            calculation_operation.upper(),
 7810                        ],
 7811                        value={},
 7812                    )
 7813
 7814        # Operations for calculation
 7815        if not operations:
 7816            operations = param.get("calculation", {}).get("calculations", {})
 7817
 7818        if operations:
 7819            log.info(f"Calculations...")
 7820
 7821        # For each operations
 7822        for operation_name in operations:
 7823            operation_name = operation_name.upper()
 7824            if operation_name not in [""]:
 7825                if operation_name in operations_config:
 7826                    log.info(f"Calculation '{operation_name}'")
 7827                    operation = operations_config[operation_name]
 7828                    operation_type = operation.get("type", "sql")
 7829                    if operation_type == "python":
 7830                        self.calculation_process_function(
 7831                            operation=operation, operation_name=operation_name
 7832                        )
 7833                    elif operation_type == "sql":
 7834                        self.calculation_process_sql(
 7835                            operation=operation, operation_name=operation_name
 7836                        )
 7837                    else:
 7838                        log.error(
 7839                            f"Operations config: Type '{operation_type}' NOT available"
 7840                        )
 7841                        raise ValueError(
 7842                            f"Operations config: Type '{operation_type}' NOT available"
 7843                        )
 7844                else:
 7845                    log.error(
 7846                        f"Operations config: Calculation '{operation_name}' NOT available"
 7847                    )
 7848                    raise ValueError(
 7849                        f"Operations config: Calculation '{operation_name}' NOT available"
 7850                    )
 7851
 7852        # Explode INFOS fields into table fields
 7853        if self.get_explode_infos():
 7854            self.explode_infos(
 7855                prefix=self.get_explode_infos_prefix(),
 7856                fields=self.get_explode_infos_fields(),
 7857                force=True,
 7858            )
 7859
 7860    def calculation_process_sql(
 7861        self, operation: dict, operation_name: str = "unknown"
 7862    ) -> None:
 7863        """
 7864        The `calculation_process_sql` function takes in a mathematical operation as a string and
 7865        performs the operation, updating the specified table with the result.
 7866
 7867        :param operation: The `operation` parameter is a dictionary that contains information about the
 7868        mathematical operation to be performed. It includes the following keys:
 7869        :type operation: dict
 7870        :param operation_name: The `operation_name` parameter is a string that represents the name of
 7871        the mathematical operation being performed. It is used for logging and error handling purposes,
 7872        defaults to unknown
 7873        :type operation_name: str (optional)
 7874        """
 7875
 7876        # table variants
 7877        table_variants = self.get_table_variants(clause="alter")
 7878
 7879        # Operation infos
 7880        operation_name = operation.get("name", "unknown")
 7881        log.debug(f"process sql {operation_name}")
 7882        output_column_name = operation.get("output_column_name", operation_name)
 7883        output_column_type = operation.get("output_column_type", "String")
 7884        prefix = operation.get("explode_infos_prefix", "")
 7885        output_column_type_sql = code_type_map_to_sql.get(output_column_type, "VARCHAR")
 7886        output_column_description = operation.get(
 7887            "output_column_description", f"{operation_name} operation"
 7888        )
 7889        operation_query = operation.get("operation_query", None)
 7890        if isinstance(operation_query, list):
 7891            operation_query = " ".join(operation_query)
 7892        operation_info_fields = operation.get("info_fields", [])
 7893        operation_info_fields_check = operation.get("info_fields_check", False)
 7894        operation_info = operation.get("operation_info", True)
 7895
 7896        if operation_query:
 7897
 7898            # Info fields check
 7899            operation_info_fields_check_result = True
 7900            if operation_info_fields_check:
 7901                header_infos = self.get_header().infos
 7902                for info_field in operation_info_fields:
 7903                    operation_info_fields_check_result = (
 7904                        operation_info_fields_check_result
 7905                        and info_field in header_infos
 7906                    )
 7907
 7908            # If info fields available
 7909            if operation_info_fields_check_result:
 7910
 7911                # Added_columns
 7912                added_columns = []
 7913
 7914                # Create VCF header field
 7915                vcf_reader = self.get_header()
 7916                vcf_reader.infos[output_column_name] = vcf.parser._Info(
 7917                    output_column_name,
 7918                    ".",
 7919                    output_column_type,
 7920                    output_column_description,
 7921                    "howard calculation",
 7922                    "0",
 7923                    self.code_type_map.get(output_column_type),
 7924                )
 7925
 7926                # Explode infos if needed
 7927                log.debug(f"calculation_process_sql prefix {prefix}")
 7928                added_columns += self.explode_infos(
 7929                    prefix=prefix,
 7930                    fields=[output_column_name] + operation_info_fields,
 7931                    force=True,
 7932                )
 7933
 7934                # Create column
 7935                added_column = self.add_column(
 7936                    table_name=table_variants,
 7937                    column_name=prefix + output_column_name,
 7938                    column_type=output_column_type_sql,
 7939                    default_value="null",
 7940                )
 7941                added_columns.append(added_column)
 7942
 7943                # Operation calculation
 7944                try:
 7945
 7946                    # Query to update calculation column
 7947                    sql_update = f"""
 7948                        UPDATE {table_variants}
 7949                        SET "{prefix}{output_column_name}" = ({operation_query})
 7950                    """
 7951                    self.conn.execute(sql_update)
 7952
 7953                    # Add to INFO
 7954                    if operation_info:
 7955                        sql_update_info = f"""
 7956                            UPDATE {table_variants}
 7957                            SET "INFO" =
 7958                                concat(
 7959                                    CASE
 7960                                        WHEN "INFO" IS NOT NULL
 7961                                        THEN concat("INFO", ';')
 7962                                        ELSE ''
 7963                                    END,
 7964                                    '{output_column_name}=',
 7965                                    "{prefix}{output_column_name}"
 7966                                )
 7967                            WHERE "{prefix}{output_column_name}" IS NOT NULL AND "{prefix}{output_column_name}" NOT IN ('')
 7968                        """
 7969                        self.conn.execute(sql_update_info)
 7970
 7971                except:
 7972                    log.error(
 7973                        f"Operations config: Calculation '{operation_name}' query failed"
 7974                    )
 7975                    raise ValueError(
 7976                        f"Operations config: Calculation '{operation_name}' query failed"
 7977                    )
 7978
 7979                # Remove added columns
 7980                for added_column in added_columns:
 7981                    log.debug(f"added_column: {added_column}")
 7982                    self.drop_column(column=added_column)
 7983
 7984            else:
 7985                log.error(
 7986                    f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}"
 7987                )
 7988                raise ValueError(
 7989                    f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}"
 7990                )
 7991
 7992        else:
 7993            log.error(
 7994                f"Operations config: Calculation '{operation_name}' query NOT defined"
 7995            )
 7996            raise ValueError(
 7997                f"Operations config: Calculation '{operation_name}' query NOT defined"
 7998            )
 7999
 8000    def calculation_process_function(
 8001        self, operation: dict, operation_name: str = "unknown"
 8002    ) -> None:
 8003        """
 8004        The `calculation_process_function` takes in an operation dictionary and performs the specified
 8005        function with the given parameters.
 8006
 8007        :param operation: The `operation` parameter is a dictionary that contains information about the
 8008        operation to be performed. It has the following keys:
 8009        :type operation: dict
 8010        :param operation_name: The `operation_name` parameter is a string that represents the name of
 8011        the operation being performed. It is used for logging purposes, defaults to unknown
 8012        :type operation_name: str (optional)
 8013        """
 8014
 8015        operation_name = operation["name"]
 8016        log.debug(f"process sql {operation_name}")
 8017        function_name = operation["function_name"]
 8018        function_params = operation["function_params"]
 8019        getattr(self, function_name)(*function_params)
 8020
 8021    def calculation_variant_id(self) -> None:
 8022        """
 8023        The function `calculation_variant_id` adds a variant ID annotation to a VCF file header and
 8024        updates the INFO field of a variants table with the variant ID.
 8025        """
 8026
 8027        # variant_id annotation field
 8028        variant_id_tag = self.get_variant_id_column()
 8029        added_columns = [variant_id_tag]
 8030
 8031        # variant_id hgvs tags"
 8032        vcf_infos_tags = {
 8033            variant_id_tag: "howard variant ID annotation",
 8034        }
 8035
 8036        # Variants table
 8037        table_variants = self.get_table_variants()
 8038
 8039        # Header
 8040        vcf_reader = self.get_header()
 8041
 8042        # Add variant_id to header
 8043        vcf_reader.infos[variant_id_tag] = vcf.parser._Info(
 8044            variant_id_tag,
 8045            ".",
 8046            "String",
 8047            vcf_infos_tags.get(variant_id_tag, "howard variant ID annotation"),
 8048            "howard calculation",
 8049            "0",
 8050            self.code_type_map.get("String"),
 8051        )
 8052
 8053        # Update
 8054        sql_update = f"""
 8055            UPDATE {table_variants}
 8056            SET "INFO" = 
 8057                concat(
 8058                    CASE
 8059                        WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 8060                        THEN ''
 8061                        ELSE concat("INFO", ';')
 8062                    END,
 8063                    '{variant_id_tag}=',
 8064                    "{variant_id_tag}"
 8065                )
 8066        """
 8067        self.conn.execute(sql_update)
 8068
 8069        # Remove added columns
 8070        for added_column in added_columns:
 8071            self.drop_column(column=added_column)
 8072
 8073    def calculation_extract_snpeff_hgvs(
 8074        self,
 8075        snpeff_hgvs: str = "snpeff_hgvs",
 8076        snpeff_field: str = "ANN",
 8077    ) -> None:
 8078        """
 8079        The function `calculation_extract_snpeff_hgvs` extracts HGVS nomenclatures from the SnpEff
 8080        annotation field in a VCF file and adds them as a new column in the variants table.
 8081
 8082        :param snpeff_hgvs: The `snpeff_hgvs` parameter in the `calculation_extract_snpeff_hgvs`
 8083        function is used to specify the name of the column that will store the HGVS nomenclatures
 8084        extracted from the SnpEff annotation field in a VCF file. This parameter allows you, defaults to
 8085        snpeff_hgvs
 8086        :type snpeff_hgvs: str (optional)
 8087        :param snpeff_field: The `snpeff_field` parameter in the `calculation_extract_snpeff_hgvs`
 8088        function represents the field in the VCF file that contains SnpEff annotations. This field is
 8089        used to extract HGVS nomenclatures from the SnpEff annotation field and add them as a, defaults
 8090        to ANN
 8091        :type snpeff_field: str (optional)
 8092        """
 8093
 8094        # Snpeff hgvs tags
 8095        vcf_infos_tags = {
 8096            snpeff_hgvs: "HGVS nomenclatures from snpEff annotation",
 8097        }
 8098
 8099        # Prefix
 8100        prefix = self.get_explode_infos_prefix()
 8101        if prefix:
 8102            prefix = "INFO/"
 8103
 8104        # snpEff fields
 8105        speff_ann_infos = prefix + snpeff_field
 8106        speff_hgvs_infos = prefix + snpeff_hgvs
 8107
 8108        # Variants table
 8109        table_variants = self.get_table_variants()
 8110
 8111        # Header
 8112        vcf_reader = self.get_header()
 8113
 8114        # Add columns
 8115        added_columns = []
 8116
 8117        # Explode HGVS field in column
 8118        added_columns += self.explode_infos(fields=[snpeff_field])
 8119
 8120        if snpeff_field in vcf_reader.infos:
 8121
 8122            log.debug(vcf_reader.infos[snpeff_field])
 8123
 8124            # Extract ANN header
 8125            ann_description = vcf_reader.infos[snpeff_field].desc
 8126            pattern = r"'(.+?)'"
 8127            match = re.search(pattern, ann_description)
 8128            if match:
 8129                ann_header_match = match.group(1).split(" | ")
 8130                ann_header_desc = {}
 8131                for i in range(len(ann_header_match)):
 8132                    ann_header_info = "".join(
 8133                        char for char in ann_header_match[i] if char.isalnum()
 8134                    )
 8135                    ann_header_desc[ann_header_info] = ann_header_match[i]
 8136                if not ann_header_desc:
 8137                    raise ValueError("Invalid header description format")
 8138            else:
 8139                raise ValueError("Invalid header description format")
 8140
 8141            # Create variant id
 8142            variant_id_column = self.get_variant_id_column()
 8143            added_columns += [variant_id_column]
 8144
 8145            # Create dataframe
 8146            dataframe_snpeff_hgvs = self.get_query_to_df(
 8147                f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
 8148            )
 8149
 8150            # Create main NOMEN column
 8151            dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
 8152                speff_ann_infos
 8153            ].apply(
 8154                lambda x: extract_snpeff_hgvs(
 8155                    str(x), header=list(ann_header_desc.values())
 8156                )
 8157            )
 8158
 8159            # Add snpeff_hgvs to header
 8160            vcf_reader.infos[snpeff_hgvs] = vcf.parser._Info(
 8161                snpeff_hgvs,
 8162                ".",
 8163                "String",
 8164                vcf_infos_tags.get(snpeff_hgvs, "snpEff hgvs annotations"),
 8165                "howard calculation",
 8166                "0",
 8167                self.code_type_map.get("String"),
 8168            )
 8169
 8170            # Update
 8171            sql_update = f"""
 8172                UPDATE variants
 8173                SET "INFO" = 
 8174                    concat(
 8175                        CASE
 8176                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 8177                            THEN ''
 8178                            ELSE concat("INFO", ';')
 8179                        END,
 8180                        CASE 
 8181                            WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
 8182                            AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
 8183                            THEN concat(
 8184                                    '{snpeff_hgvs}=',
 8185                                    dataframe_snpeff_hgvs."{speff_hgvs_infos}"
 8186                                )
 8187                            ELSE ''
 8188                        END
 8189                    )
 8190                FROM dataframe_snpeff_hgvs
 8191                WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"
 8192
 8193            """
 8194            self.conn.execute(sql_update)
 8195
 8196            # Delete dataframe
 8197            del dataframe_snpeff_hgvs
 8198            gc.collect()
 8199
 8200        else:
 8201
 8202            log.warning(
 8203                "No snpEff annotation. Please Anotate with snpEff before use this calculation option"
 8204            )
 8205
 8206        # Remove added columns
 8207        for added_column in added_columns:
 8208            self.drop_column(column=added_column)
 8209
 8210    def calculation_snpeff_ann_explode(
 8211        self,
 8212        uniquify: bool = True,
 8213        output_format: str = "fields",
 8214        output_prefix: str = "snpeff_",
 8215        snpeff_field: str = "ANN",
 8216    ) -> None:
 8217        """
 8218        The `calculation_snpeff_ann_explode` function processes SnpEff annotations in a VCF file by
 8219        exploding the HGVS field and updating variant information accordingly.
 8220
 8221        :param uniquify: The `uniquify` parameter in the `calculation_snpeff_ann_explode` method is a
 8222        boolean flag that determines whether the output should be uniquified or not. When set to `True`,
 8223        it indicates that the output should be unique, meaning that duplicate entries should be removed,
 8224        defaults to True
 8225        :type uniquify: bool (optional)
 8226        :param output_format: The `output_format` parameter in the `calculation_snpeff_ann_explode`
 8227        function specifies the format in which the output annotations will be generated. It has a
 8228        default value of "fields". You can also set it to "JSON" to output the annotations in JSON
 8229        format, defaults to fields
 8230        :type output_format: str (optional)
 8231        :param output_prefix: The `output_prefix` parameter in the `calculation_snpeff_ann_explode`
 8232        method is used to specify the prefix that will be added to the output annotations generated
 8233        during the calculation process. This prefix helps to differentiate the newly added annotations
 8234        from existing ones in the output data. By default, the, defaults to ANN_
 8235        :type output_prefix: str (optional)
 8236        :param snpeff_field: The `snpeff_field` parameter in the `calculation_snpeff_ann_explode`
 8237        function is used to specify the field in the VCF file that contains SnpEff annotations. This
 8238        field will be processed to explode the HGVS annotations and update the variant information
 8239        accordingly, defaults to ANN
 8240        :type snpeff_field: str (optional)
 8241        """
 8242
 8243        # SnpEff annotation field
 8244        snpeff_hgvs = "snpeff_ann_explode"
 8245
 8246        # Snpeff hgvs tags
 8247        vcf_infos_tags = {
 8248            snpeff_hgvs: "Explode snpEff annotations",
 8249        }
 8250
 8251        # Prefix
 8252        prefix = self.get_explode_infos_prefix()
 8253        if prefix:
 8254            prefix = "INFO/"
 8255
 8256        # snpEff fields
 8257        speff_ann_infos = prefix + snpeff_field
 8258        speff_hgvs_infos = prefix + snpeff_hgvs
 8259
 8260        # Variants table
 8261        table_variants = self.get_table_variants()
 8262
 8263        # Header
 8264        vcf_reader = self.get_header()
 8265
 8266        # Add columns
 8267        added_columns = []
 8268
 8269        # Explode HGVS field in column
 8270        added_columns += self.explode_infos(fields=[snpeff_field])
 8271        log.debug(f"snpeff_field={snpeff_field}")
 8272        log.debug(f"added_columns={added_columns}")
 8273
 8274        if snpeff_field in vcf_reader.infos:
 8275
 8276            # Extract ANN header
 8277            ann_description = vcf_reader.infos[snpeff_field].desc
 8278            pattern = r"'(.+?)'"
 8279            match = re.search(pattern, ann_description)
 8280            if match:
 8281                ann_header_match = match.group(1).split(" | ")
 8282                ann_header = []
 8283                ann_header_desc = {}
 8284                for i in range(len(ann_header_match)):
 8285                    ann_header_info = "".join(
 8286                        char for char in ann_header_match[i] if char.isalnum()
 8287                    )
 8288                    ann_header.append(ann_header_info)
 8289                    ann_header_desc[ann_header_info] = ann_header_match[i]
 8290                if not ann_header_desc:
 8291                    raise ValueError("Invalid header description format")
 8292            else:
 8293                raise ValueError("Invalid header description format")
 8294
 8295            # Create variant id
 8296            variant_id_column = self.get_variant_id_column()
 8297            added_columns += [variant_id_column]
 8298
 8299            # Create dataframe
 8300            dataframe_snpeff_hgvs = self.get_query_to_df(
 8301                f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
 8302            )
 8303
 8304            # Create snpEff columns
 8305            dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
 8306                speff_ann_infos
 8307            ].apply(
 8308                lambda x: explode_snpeff_ann(
 8309                    str(x),
 8310                    uniquify=uniquify,
 8311                    output_format=output_format,
 8312                    prefix=output_prefix,
 8313                    header=list(ann_header_desc.values()),
 8314                )
 8315            )
 8316
 8317            # Header
 8318            ann_annotations_prefix = ""
 8319            if output_format.upper() in ["JSON"]:
 8320                ann_annotations_prefix = f"{output_prefix}="
 8321                vcf_reader.infos[output_prefix] = vcf.parser._Info(
 8322                    output_prefix,
 8323                    ".",
 8324                    "String",
 8325                    vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
 8326                    + " - JSON format",
 8327                    "howard calculation",
 8328                    "0",
 8329                    self.code_type_map.get("String"),
 8330                )
 8331            else:
 8332                for ann_annotation in ann_header:
 8333                    ann_annotation_id = f"{output_prefix}{ann_annotation}"
 8334                    vcf_reader.infos[ann_annotation_id] = vcf.parser._Info(
 8335                        ann_annotation_id,
 8336                        ".",
 8337                        "String",
 8338                        vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
 8339                        + f" - '{ann_header_desc[ann_annotation]}' annotation",
 8340                        "howard calculation",
 8341                        "0",
 8342                        self.code_type_map.get("String"),
 8343                    )
 8344
 8345            # Update
 8346            sql_update = f"""
 8347                UPDATE variants
 8348                SET "INFO" = 
 8349                    concat(
 8350                        CASE
 8351                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 8352                            THEN ''
 8353                            ELSE concat("INFO", ';')
 8354                        END,
 8355                        CASE 
 8356                            WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
 8357                                AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
 8358                            THEN concat(
 8359                                '{ann_annotations_prefix}',
 8360                                dataframe_snpeff_hgvs."{speff_hgvs_infos}"
 8361                                )
 8362                            ELSE ''
 8363                        END
 8364                    )
 8365                FROM dataframe_snpeff_hgvs
 8366                WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"
 8367
 8368            """
 8369            self.conn.execute(sql_update)
 8370
 8371            # Delete dataframe
 8372            del dataframe_snpeff_hgvs
 8373            gc.collect()
 8374
 8375        else:
 8376
 8377            log.warning(
 8378                "No snpEff annotation. Please Anotate with snpEff before use this calculation option"
 8379            )
 8380
 8381        # Remove added columns
 8382        for added_column in added_columns:
 8383            self.drop_column(column=added_column)
 8384
    def calculation_extract_nomen(self) -> None:
        """
        Extract HGVS nomenclature components (NOMEN and related fields) for
        each variant and append them to the INFO column.

        Reads the HGVS annotation field (option ``hgvs_field`` of the NOMEN
        calculation, default ``hgvs``) from the exploded INFO columns, picks
        a reference nomenclature per variant with ``find_nomen`` (optionally
        constrained by a preferred-transcripts file), declares one INFO tag
        per NOMEN component in the VCF header, and appends the non-empty
        components to INFO.

        :raises ValueError: if the configured transcripts file does not exist
        """

        # Working column holding the per-variant dict of NOMEN components
        field_nomen_dict = "NOMEN_DICT"

        # NOMEN components and their VCF header descriptions
        nomen_dict = {
            "NOMEN": "NOMEN hgvs nomenclature considered as reference hgvs (official transcript, first otherwise)",
            "CNOMEN": "CNOMEN hgvs nomenclature at DNA level related to a transcript (TNOMEN)",
            "RNOMEN": "RNOMEN hgvs nomenclature at RNA level related to a transcript (TNOMEN)",
            "NNOMEN": "NNOMEN hgvs nomenclature for non-coding variant",
            "PNOMEN": "PNOMEN hgvs nomenclature at Protein level related to a transcript (TNOMEN)",
            "TVNOMEN": "TVNOMEN hgvs transcript with version (if any) used (e.g. for CNOMEN and PNOMEN)",
            "TNOMEN": "TNOMEN hgvs transcript used (e.g. for CNOMEN and PNOMEN)",
            "VNOMEN": "VNOMEN hgvs transcript version used (e.g. for CNOMEN and PNOMEN)",
            "ENOMEN": "ENOMEN hgvs exon nomenclature related to a transcript (TNOMEN)",
            "GNOMEN": "GNOMEN hgvs gene nomenclature related to a transcript (TNOMEN)",
        }

        # Param
        param = self.get_param()

        # Prefix of the exploded INFO columns
        prefix = self.get_explode_infos_prefix()

        # Header
        vcf_reader = self.get_header()

        # Name of the INFO field containing the HGVS annotations
        hgvs_field = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("hgvs_field", "hgvs")
        )

        # Optional preferred-transcripts file (first column = transcript IDs)
        transcripts_file = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("transcripts", None)
        )
        transcripts_file = full_path(transcripts_file)
        transcripts = []
        if transcripts_file:
            if os.path.exists(transcripts_file):
                transcripts_dataframe = transcripts_file_to_df(transcripts_file)
                # First column is taken as the ordered list of transcripts
                transcripts = transcripts_dataframe.iloc[:, 0].tolist()
            else:
                log.error(f"Transcript file '{transcripts_file}' does NOT exist")
                raise ValueError(f"Transcript file '{transcripts_file}' does NOT exist")

        # Columns added along the way, dropped at the end
        added_columns = []

        # Explode HGVS field in column
        added_columns += self.explode_infos(fields=[hgvs_field])

        # Exploded (extra) columns available for querying
        extra_infos = self.get_extra_infos()
        extra_field = prefix + hgvs_field

        if extra_field in extra_infos:

            # Create dataframe of variant keys and HGVS annotations
            # NOTE(review): this queries the literal 'variants' table rather
            # than get_table_variants() — confirm both always match
            dataframe_hgvs = self.get_query_to_df(
                f""" SELECT "#CHROM", "POS", "REF", "ALT", "{extra_field}" FROM variants """
            )

            # Pick the reference NOMEN (as a dict of components) per variant
            dataframe_hgvs[field_nomen_dict] = dataframe_hgvs[extra_field].apply(
                lambda x: find_nomen(str(x), transcripts=transcripts)
            )

            # Explode NOMEN Structure and create SQL set for update
            sql_nomen_fields = []
            for nomen_field in nomen_dict:

                # Explode each component into its own column; the late-binding
                # closure over nomen_field is safe because apply() runs
                # immediately within this iteration
                dataframe_hgvs[nomen_field] = dataframe_hgvs[field_nomen_dict].apply(
                    lambda x: dict(x).get(nomen_field, "")
                )

                # Declare the component tag in the VCF header
                vcf_reader.infos[nomen_field] = vcf.parser._Info(
                    nomen_field,
                    ".",
                    "String",
                    nomen_dict.get(nomen_field, "howard calculation NOMEN"),
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )
                # Emit ';<field>=<value>' only when the component is non-empty
                sql_nomen_fields.append(
                    f"""
                        CASE 
                            WHEN dataframe_hgvs."{nomen_field}" NOT NULL AND dataframe_hgvs."{nomen_field}" NOT IN ('')
                            THEN concat(
                                    ';{nomen_field}=',
                                    dataframe_hgvs."{nomen_field}"
                                )
                            ELSE ''
                        END
                    """
                )

            # SQL set for update
            sql_nomen_fields_set = ", ".join(sql_nomen_fields)

            # Append the components to INFO.
            # The query references the local DataFrame 'dataframe_hgvs' by
            # name (duckdb replacement scan) — do not rename the variable.
            # NOTE(review): each component string starts with ';', so a row
            # whose INFO was NULL ends up with a leading ';' — confirm intended
            sql_update = f"""
                UPDATE variants
                SET "INFO" = 
                    concat(
                        CASE
                            WHEN "INFO" IS NULL
                            THEN ''
                            ELSE "INFO"
                        END,
                        {sql_nomen_fields_set}
                    )
                FROM dataframe_hgvs
                WHERE variants."#CHROM" = dataframe_hgvs."#CHROM"
                    AND variants."POS" = dataframe_hgvs."POS" 
                    AND variants."REF" = dataframe_hgvs."REF"
                    AND variants."ALT" = dataframe_hgvs."ALT"
            """
            self.conn.execute(sql_update)

            # Release the dataframe explicitly (can be large)
            del dataframe_hgvs
            gc.collect()

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)
 8527
 8528    def calculation_find_by_pipeline(self, tag: str = "findbypipeline") -> None:
 8529        """
 8530        The function `calculation_find_by_pipeline` performs a calculation to find the number of
 8531        pipeline/sample for a variant and updates the variant information in a VCF file.
 8532
 8533        :param tag: The `tag` parameter is a string that represents the annotation field for the
 8534        "findbypipeline" information in the VCF file. It is used to create the annotation field in the
 8535        VCF header and to update the corresponding field in the variants table, defaults to
 8536        findbypipeline
 8537        :type tag: str (optional)
 8538        """
 8539
 8540        # if FORMAT and samples
 8541        if (
 8542            "FORMAT" in self.get_header_columns_as_list()
 8543            and self.get_header_sample_list()
 8544        ):
 8545
 8546            # findbypipeline annotation field
 8547            findbypipeline_tag = tag
 8548
 8549            # VCF infos tags
 8550            vcf_infos_tags = {
 8551                findbypipeline_tag: f"Number of pipeline/sample for a variant ({findbypipeline_tag})",
 8552            }
 8553
 8554            # Prefix
 8555            prefix = self.get_explode_infos_prefix()
 8556
 8557            # Field
 8558            findbypipeline_infos = prefix + findbypipeline_tag
 8559
 8560            # Variants table
 8561            table_variants = self.get_table_variants()
 8562
 8563            # Header
 8564            vcf_reader = self.get_header()
 8565
 8566            # Create variant id
 8567            variant_id_column = self.get_variant_id_column()
 8568            added_columns = [variant_id_column]
 8569
 8570            # variant_id, FORMAT and samples
 8571            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
 8572                self.get_header_sample_list()
 8573            )
 8574
 8575            # Create dataframe
 8576            dataframe_findbypipeline = self.get_query_to_df(
 8577                f""" SELECT {samples_fields} FROM {table_variants} """
 8578            )
 8579
 8580            # Create findbypipeline column
 8581            dataframe_findbypipeline[findbypipeline_infos] = (
 8582                dataframe_findbypipeline.apply(
 8583                    lambda row: findbypipeline(
 8584                        row, samples=self.get_header_sample_list()
 8585                    ),
 8586                    axis=1,
 8587                )
 8588            )
 8589
 8590            # Add snpeff_hgvs to header
 8591            vcf_reader.infos[findbypipeline_tag] = vcf.parser._Info(
 8592                findbypipeline_tag,
 8593                ".",
 8594                "String",
 8595                vcf_infos_tags.get(findbypipeline_tag, "Find in pipeline/sample"),
 8596                "howard calculation",
 8597                "0",
 8598                self.code_type_map.get("String"),
 8599            )
 8600
 8601            # Update
 8602            sql_update = f"""
 8603                UPDATE variants
 8604                SET "INFO" = 
 8605                    concat(
 8606                        CASE
 8607                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 8608                            THEN ''
 8609                            ELSE concat("INFO", ';')
 8610                        END,
 8611                        CASE 
 8612                            WHEN dataframe_findbypipeline."{findbypipeline_infos}" NOT IN ('','.')
 8613                                AND dataframe_findbypipeline."{findbypipeline_infos}" NOT NULL
 8614                            THEN concat(
 8615                                    '{findbypipeline_tag}=',
 8616                                    dataframe_findbypipeline."{findbypipeline_infos}"
 8617                                )
 8618                            ELSE ''
 8619                        END
 8620                    )
 8621                FROM dataframe_findbypipeline
 8622                WHERE variants."{variant_id_column}" = dataframe_findbypipeline."{variant_id_column}"
 8623            """
 8624            self.conn.execute(sql_update)
 8625
 8626            # Remove added columns
 8627            for added_column in added_columns:
 8628                self.drop_column(column=added_column)
 8629
 8630            # Delete dataframe
 8631            del dataframe_findbypipeline
 8632            gc.collect()
 8633
 8634    def calculation_genotype_concordance(self) -> None:
 8635        """
 8636        The function `calculation_genotype_concordance` calculates the genotype concordance for
 8637        multi-caller VCF files and updates the variant information in the database.
 8638        """
 8639
 8640        # if FORMAT and samples
 8641        if (
 8642            "FORMAT" in self.get_header_columns_as_list()
 8643            and self.get_header_sample_list()
 8644        ):
 8645
 8646            # genotypeconcordance annotation field
 8647            genotypeconcordance_tag = "genotypeconcordance"
 8648
 8649            # VCF infos tags
 8650            vcf_infos_tags = {
 8651                genotypeconcordance_tag: "Concordance of genotype for multi caller VCF",
 8652            }
 8653
 8654            # Prefix
 8655            prefix = self.get_explode_infos_prefix()
 8656
 8657            # Field
 8658            genotypeconcordance_infos = prefix + genotypeconcordance_tag
 8659
 8660            # Variants table
 8661            table_variants = self.get_table_variants()
 8662
 8663            # Header
 8664            vcf_reader = self.get_header()
 8665
 8666            # Create variant id
 8667            variant_id_column = self.get_variant_id_column()
 8668            added_columns = [variant_id_column]
 8669
 8670            # variant_id, FORMAT and samples
 8671            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
 8672                self.get_header_sample_list()
 8673            )
 8674
 8675            # Create dataframe
 8676            dataframe_genotypeconcordance = self.get_query_to_df(
 8677                f""" SELECT {samples_fields} FROM {table_variants} """
 8678            )
 8679
 8680            # Create genotypeconcordance column
 8681            dataframe_genotypeconcordance[genotypeconcordance_infos] = (
 8682                dataframe_genotypeconcordance.apply(
 8683                    lambda row: genotypeconcordance(
 8684                        row, samples=self.get_header_sample_list()
 8685                    ),
 8686                    axis=1,
 8687                )
 8688            )
 8689
 8690            # Add genotypeconcordance to header
 8691            vcf_reader.infos[genotypeconcordance_tag] = vcf.parser._Info(
 8692                genotypeconcordance_tag,
 8693                ".",
 8694                "String",
 8695                vcf_infos_tags.get(genotypeconcordance_tag, "snpEff hgvs annotations"),
 8696                "howard calculation",
 8697                "0",
 8698                self.code_type_map.get("String"),
 8699            )
 8700
 8701            # Update
 8702            sql_update = f"""
 8703                UPDATE variants
 8704                SET "INFO" = 
 8705                    concat(
 8706                        CASE
 8707                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 8708                            THEN ''
 8709                            ELSE concat("INFO", ';')
 8710                        END,
 8711                        CASE
 8712                            WHEN dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT IN ('','.')
 8713                                AND dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT NULL
 8714                            THEN concat(
 8715                                    '{genotypeconcordance_tag}=',
 8716                                    dataframe_genotypeconcordance."{genotypeconcordance_infos}"
 8717                                )
 8718                            ELSE ''
 8719                        END
 8720                    )
 8721                FROM dataframe_genotypeconcordance
 8722                WHERE variants."{variant_id_column}" = dataframe_genotypeconcordance."{variant_id_column}"
 8723            """
 8724            self.conn.execute(sql_update)
 8725
 8726            # Remove added columns
 8727            for added_column in added_columns:
 8728                self.drop_column(column=added_column)
 8729
 8730            # Delete dataframe
 8731            del dataframe_genotypeconcordance
 8732            gc.collect()
 8733
 8734    def calculation_barcode(self, tag: str = "barcode") -> None:
 8735        """
 8736        The `calculation_barcode` function calculates barcode values for variants in a VCF file and
 8737        updates the INFO field in the file with the calculated barcode values.
 8738
 8739        :param tag: The `tag` parameter in the `calculation_barcode` function is used to specify the tag
 8740        name that will be used for the barcode calculation in the VCF file. If no tag name is provided,
 8741        the default tag name is set to "barcode", defaults to barcode
 8742        :type tag: str (optional)
 8743        """
 8744
 8745        # if FORMAT and samples
 8746        if (
 8747            "FORMAT" in self.get_header_columns_as_list()
 8748            and self.get_header_sample_list()
 8749        ):
 8750
 8751            # barcode annotation field
 8752            if not tag:
 8753                tag = "barcode"
 8754
 8755            # VCF infos tags
 8756            vcf_infos_tags = {
 8757                tag: "barcode calculation (VaRank)",
 8758            }
 8759
 8760            # Prefix
 8761            prefix = self.get_explode_infos_prefix()
 8762
 8763            # Field
 8764            barcode_infos = prefix + tag
 8765
 8766            # Variants table
 8767            table_variants = self.get_table_variants()
 8768
 8769            # Header
 8770            vcf_reader = self.get_header()
 8771
 8772            # Create variant id
 8773            variant_id_column = self.get_variant_id_column()
 8774            added_columns = [variant_id_column]
 8775
 8776            # variant_id, FORMAT and samples
 8777            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
 8778                self.get_header_sample_list()
 8779            )
 8780
 8781            # Create dataframe
 8782            dataframe_barcode = self.get_query_to_df(
 8783                f""" SELECT {samples_fields} FROM {table_variants} """
 8784            )
 8785
 8786            # Create barcode column
 8787            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
 8788                lambda row: barcode(row, samples=self.get_header_sample_list()), axis=1
 8789            )
 8790
 8791            # Add barcode to header
 8792            vcf_reader.infos[tag] = vcf.parser._Info(
 8793                tag,
 8794                ".",
 8795                "String",
 8796                vcf_infos_tags.get(tag, vcf_infos_tags.get(tag)),
 8797                "howard calculation",
 8798                "0",
 8799                self.code_type_map.get("String"),
 8800            )
 8801
 8802            # Update
 8803            sql_update = f"""
 8804                UPDATE {table_variants}
 8805                SET "INFO" = 
 8806                    concat(
 8807                        CASE
 8808                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 8809                            THEN ''
 8810                            ELSE concat("INFO", ';')
 8811                        END,
 8812                        CASE
 8813                            WHEN dataframe_barcode."{barcode_infos}" NOT IN ('','.')
 8814                            AND dataframe_barcode."{barcode_infos}" NOT NULL
 8815                            THEN concat(
 8816                                    '{tag}=',
 8817                                    dataframe_barcode."{barcode_infos}"
 8818                                )
 8819                            ELSE ''
 8820                        END
 8821                    )
 8822                FROM dataframe_barcode
 8823                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
 8824            """
 8825            self.conn.execute(sql_update)
 8826
 8827            # Remove added columns
 8828            for added_column in added_columns:
 8829                self.drop_column(column=added_column)
 8830
 8831            # Delete dataframe
 8832            del dataframe_barcode
 8833            gc.collect()
 8834
    def calculation_barcode_family(self, tag: str = "BCF") -> None:
        """
        The `calculation_barcode_family` function calculates a family barcode for variants in a
        VCF file and appends it — together with the list of family samples — to the FORMAT
        column and to every sample's genotype column.

        The family members come from the 'BARCODEFAMILY.family_pedigree' calculation parameter,
        which may be a JSON file path, a JSON string, a comma-separated list of sample names, or
        a dict; when absent, all samples of the VCF are used.

        :param tag: The `tag` parameter in the `calculation_barcode_family` function is used to specify
        the barcode tag that will be added to the VCF file during the calculation process. If no value
        is provided for the `tag` parameter, the default value used is "BCF", defaults to BCF
        :type tag: str (optional)
        :raises ValueError: if the pedigree is not well formatted or resolves to no samples
        """

        # Family barcode requires genotypes: a FORMAT column and at least one sample
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # Fall back to the default tag if an empty/None tag was explicitly passed
            if not tag:
                tag = "BCF"

            # VCF FORMAT tag descriptions: <tag> holds the barcode, <tag>S the sample list
            vcf_infos_tags = {
                tag: "barcode family calculation",
                f"{tag}S": "barcode family samples",
            }

            # Param
            param = self.get_param()
            log.debug(f"param={param}")

            # Prefix used for exploded INFO columns
            prefix = self.get_explode_infos_prefix()

            # Pedigree parameter (file path, JSON string, sample list or dict)
            ped = (
                param.get("calculation", {})
                .get("calculations", {})
                .get("BARCODEFAMILY", {})
                .get("family_pedigree", None)
            )
            log.debug(f"ped={ped}")

            # Normalize the pedigree into a dict {member: sample}
            if ped:

                # Pedigree is a file: load it as JSON
                if isinstance(ped, str) and os.path.exists(full_path(ped)):
                    log.debug("Pedigree is file")
                    with open(full_path(ped)) as ped:
                        ped = json.load(ped)

                # Pedigree is a string: try JSON first, then comma-separated sample names
                elif isinstance(ped, str):
                    log.debug("Pedigree is str")
                    try:
                        ped = json.loads(ped)
                        log.debug("Pedigree is json str")
                    except ValueError as e:
                        ped_samples = ped.split(",")
                        ped = {}
                        for ped_sample in ped_samples:
                            ped[ped_sample] = ped_sample

                # Pedigree is already a dict
                elif isinstance(ped, dict):
                    log.debug("Pedigree is dict")

                # Pedigree is not well formatted
                else:
                    msg_error = "Pedigree not well formatted"
                    log.error(msg_error)
                    raise ValueError(msg_error)

                # Samples involved in the family barcode
                ped_samples = list(ped.values())

            else:
                # No pedigree configured: use every sample of the VCF
                log.debug("Pedigree not defined. Take all samples")
                ped_samples = self.get_header_sample_list()
                ped = {}
                for ped_sample in ped_samples:
                    ped[ped_sample] = ped_sample

            # The pedigree must resolve to at least one sample
            if not ped or len(ped) == 0:
                msg_error = f"Error in pedigree: samples {ped_samples}"
                log.error(msg_error)
                raise ValueError(msg_error)

            # Log
            log.info(
                "Calculation 'BARCODEFAMILY' - Samples: "
                + ", ".join([f"{member}='{ped[member]}'" for member in ped])
            )
            log.debug(f"ped_samples={ped_samples}")

            # Name of the dataframe column holding the computed barcode
            barcode_infos = prefix + tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id column (added to the table, removed at the end)
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # Columns to fetch: variant_id, FORMAT and the family samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                ped_samples
            )

            # Load genotypes into a dataframe
            dataframe_barcode = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Compute the family barcode for each variant from the family samples
            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
                lambda row: barcode(row, samples=ped_samples), axis=1
            )

            # Declare both FORMAT tags (<tag> and <tag>S) in the VCF header
            vcf_reader.formats[tag] = vcf.parser._Format(
                id=tag,
                num=".",
                type="String",
                desc=vcf_infos_tags.get(tag, "barcode family calculation"),
                type_code=self.code_type_map.get("String"),
            )
            vcf_reader.formats[f"{tag}S"] = vcf.parser._Format(
                id=f"{tag}S",
                num=".",
                type="String",
                desc=vcf_infos_tags.get(f"{tag}S", "barcode family samples"),
                type_code=self.code_type_map.get("String"),
            )

            # Build one SET clause per column: FORMAT gets ':<tag>:<tag>S', family
            # samples get the barcode and the sample list, other samples get '.'
            sql_update_set = []
            for sample in self.get_header_sample_list() + ["FORMAT"]:
                if sample in ped_samples:
                    value = f'dataframe_barcode."{barcode_infos}"'
                    value_samples = "'" + ",".join(ped_samples) + "'"
                elif sample == "FORMAT":
                    value = f"'{tag}'"
                    value_samples = f"'{tag}S'"
                else:
                    value = "'.'"
                    value_samples = "'.'"
                # Characters of FORMAT keys stripped to rebuild an empty './.:.:...'
                # genotype with the right number of ':' separators
                format_regex = r"[a-zA-Z0-9\s]"
                sql_update_set.append(
                    f"""
                        "{sample}" = 
                        concat(
                            CASE
                                WHEN {table_variants}."{sample}" = './.'
                                THEN concat('./.',regexp_replace(regexp_replace({table_variants}.FORMAT, '{format_regex}', '', 'g'), ':', ':.', 'g'))
                                ELSE {table_variants}."{sample}"
                            END,
                            ':',
                            {value},
                            ':',
                            {value_samples}
                        )
                    """
                )

            # Apply all SET clauses in one UPDATE, joining the registered dataframe
            # on the temporary variant id column
            sql_update_set_join = ", ".join(sql_update_set)
            sql_update = f"""
                UPDATE {table_variants}
                SET {sql_update_set_join}
                FROM dataframe_barcode
                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Remove the temporary variant id column
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Release the dataframe memory
            del dataframe_barcode
            gc.collect()
 9024
 9025    def calculation_trio(self) -> None:
 9026        """
 9027        The `calculation_trio` function performs trio calculations on a VCF file by adding trio
 9028        information to the INFO field of each variant.
 9029        """
 9030
 9031        # if FORMAT and samples
 9032        if (
 9033            "FORMAT" in self.get_header_columns_as_list()
 9034            and self.get_header_sample_list()
 9035        ):
 9036
 9037            # trio annotation field
 9038            trio_tag = "trio"
 9039
 9040            # VCF infos tags
 9041            vcf_infos_tags = {
 9042                "trio": "trio calculation",
 9043            }
 9044
 9045            # Param
 9046            param = self.get_param()
 9047
 9048            # Prefix
 9049            prefix = self.get_explode_infos_prefix()
 9050
 9051            # Trio param
 9052            trio_ped = (
 9053                param.get("calculation", {})
 9054                .get("calculations", {})
 9055                .get("TRIO", {})
 9056                .get("trio_pedigree", None)
 9057            )
 9058
 9059            # Load trio
 9060            if trio_ped:
 9061
 9062                # Trio pedigree is a file
 9063                if isinstance(trio_ped, str) and os.path.exists(full_path(trio_ped)):
 9064                    log.debug("TRIO pedigree is file")
 9065                    with open(full_path(trio_ped)) as trio_ped:
 9066                        trio_ped = json.load(trio_ped)
 9067
 9068                # Trio pedigree is a string
 9069                elif isinstance(trio_ped, str):
 9070                    log.debug("TRIO pedigree is str")
 9071                    try:
 9072                        trio_ped = json.loads(trio_ped)
 9073                        log.debug("TRIO pedigree is json str")
 9074                    except ValueError as e:
 9075                        trio_samples = trio_ped.split(",")
 9076                        if len(trio_samples) == 3:
 9077                            trio_ped = {
 9078                                "father": trio_samples[0],
 9079                                "mother": trio_samples[1],
 9080                                "child": trio_samples[2],
 9081                            }
 9082                            log.debug("TRIO pedigree is list str")
 9083                        else:
 9084                            msg_error = "TRIO pedigree not well formatted"
 9085                            log.error(msg_error)
 9086                            raise ValueError(msg_error)
 9087
 9088                # Trio pedigree is a dict
 9089                elif isinstance(trio_ped, dict):
 9090                    log.debug("TRIO pedigree is dict")
 9091
 9092                # Trio pedigree is not well formatted
 9093                else:
 9094                    msg_error = "TRIO pedigree not well formatted"
 9095                    log.error(msg_error)
 9096                    raise ValueError(msg_error)
 9097
 9098                # Construct trio list
 9099                trio_samples = [
 9100                    trio_ped.get("father", ""),
 9101                    trio_ped.get("mother", ""),
 9102                    trio_ped.get("child", ""),
 9103                ]
 9104
 9105            else:
 9106                log.debug("TRIO pedigree not defined. Take the first 3 samples")
 9107                samples_list = self.get_header_sample_list()
 9108                if len(samples_list) >= 3:
 9109                    trio_samples = self.get_header_sample_list()[0:3]
 9110                    trio_ped = {
 9111                        "father": trio_samples[0],
 9112                        "mother": trio_samples[1],
 9113                        "child": trio_samples[2],
 9114                    }
 9115                else:
 9116                    msg_error = f"Error in TRIO pedigree: only {len(samples_list)} samples {samples_list}"
 9117                    log.error(msg_error)
 9118                    raise ValueError(msg_error)
 9119
 9120            # Check trio pedigree
 9121            if not trio_ped or len(trio_ped) != 3:
 9122                msg_error = f"Error in TRIO pedigree: {trio_ped}"
 9123                log.error(msg_error)
 9124                raise ValueError(msg_error)
 9125
 9126            # Log
 9127            log.info(
 9128                f"Calculation 'TRIO' - Samples: "
 9129                + ", ".join([f"{member}='{trio_ped[member]}'" for member in trio_ped])
 9130            )
 9131
 9132            # Field
 9133            trio_infos = prefix + trio_tag
 9134
 9135            # Variants table
 9136            table_variants = self.get_table_variants()
 9137
 9138            # Header
 9139            vcf_reader = self.get_header()
 9140
 9141            # Create variant id
 9142            variant_id_column = self.get_variant_id_column()
 9143            added_columns = [variant_id_column]
 9144
 9145            # variant_id, FORMAT and samples
 9146            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
 9147                self.get_header_sample_list()
 9148            )
 9149
 9150            # Create dataframe
 9151            dataframe_trio = self.get_query_to_df(
 9152                f""" SELECT {samples_fields} FROM {table_variants} """
 9153            )
 9154
 9155            # Create trio column
 9156            dataframe_trio[trio_infos] = dataframe_trio.apply(
 9157                lambda row: trio(row, samples=trio_samples), axis=1
 9158            )
 9159
 9160            # Add trio to header
 9161            vcf_reader.infos[trio_tag] = vcf.parser._Info(
 9162                trio_tag,
 9163                ".",
 9164                "String",
 9165                vcf_infos_tags.get(trio_tag, "snpEff hgvs annotations"),
 9166                "howard calculation",
 9167                "0",
 9168                self.code_type_map.get("String"),
 9169            )
 9170
 9171            # Update
 9172            sql_update = f"""
 9173                UPDATE {table_variants}
 9174                SET "INFO" = 
 9175                    concat(
 9176                        CASE
 9177                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 9178                            THEN ''
 9179                            ELSE concat("INFO", ';')
 9180                        END,
 9181                        CASE
 9182                            WHEN dataframe_trio."{trio_infos}" NOT IN ('','.')
 9183                             AND dataframe_trio."{trio_infos}" NOT NULL
 9184                            THEN concat(
 9185                                    '{trio_tag}=',
 9186                                    dataframe_trio."{trio_infos}"
 9187                                )
 9188                            ELSE ''
 9189                        END
 9190                    )
 9191                FROM dataframe_trio
 9192                WHERE {table_variants}."{variant_id_column}" = dataframe_trio."{variant_id_column}"
 9193            """
 9194            self.conn.execute(sql_update)
 9195
 9196            # Remove added columns
 9197            for added_column in added_columns:
 9198                self.drop_column(column=added_column)
 9199
 9200            # Delete dataframe
 9201            del dataframe_trio
 9202            gc.collect()
 9203
 9204    def calculation_vaf_normalization(self) -> None:
 9205        """
 9206        The `calculation_vaf_normalization` function calculates the VAF (Variant Allele Frequency)
 9207        normalization for each sample in a VCF file and updates the FORMAT and INFO fields accordingly.
 9208        :return: The function does not return anything.
 9209        """
 9210
 9211        # if FORMAT and samples
 9212        if (
 9213            "FORMAT" in self.get_header_columns_as_list()
 9214            and self.get_header_sample_list()
 9215        ):
 9216
 9217            # vaf_normalization annotation field
 9218            vaf_normalization_tag = "VAF"
 9219
 9220            # VCF infos tags
 9221            vcf_infos_tags = {
 9222                "VAF": "VAF Variant Frequency",
 9223            }
 9224
 9225            # Prefix
 9226            prefix = self.get_explode_infos_prefix()
 9227
 9228            # Variants table
 9229            table_variants = self.get_table_variants()
 9230
 9231            # Header
 9232            vcf_reader = self.get_header()
 9233
 9234            # Do not calculate if VAF already exists
 9235            if "VAF" in vcf_reader.formats:
 9236                log.debug("VAF already on genotypes")
 9237                return
 9238
 9239            # Create variant id
 9240            variant_id_column = self.get_variant_id_column()
 9241            added_columns = [variant_id_column]
 9242
 9243            # variant_id, FORMAT and samples
 9244            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
 9245                f""" "{sample}" """ for sample in self.get_header_sample_list()
 9246            )
 9247
 9248            # Create dataframe
 9249            query = f""" SELECT {variant_id_column}, FORMAT, {samples_fields} FROM {table_variants} """
 9250            log.debug(f"query={query}")
 9251            dataframe_vaf_normalization = self.get_query_to_df(query=query)
 9252
 9253            vaf_normalization_set = []
 9254
 9255            # for each sample vaf_normalization
 9256            for sample in self.get_header_sample_list():
 9257                dataframe_vaf_normalization[sample] = dataframe_vaf_normalization.apply(
 9258                    lambda row: vaf_normalization(row, sample=sample), axis=1
 9259                )
 9260                vaf_normalization_set.append(
 9261                    f""" "{sample}" = dataframe_vaf_normalization."{sample}" """
 9262                )
 9263
 9264            # Add VAF to FORMAT
 9265            dataframe_vaf_normalization["FORMAT"] = dataframe_vaf_normalization[
 9266                "FORMAT"
 9267            ].apply(lambda x: str(x) + ":VAF")
 9268            vaf_normalization_set.append(
 9269                f""" "FORMAT" = dataframe_vaf_normalization."FORMAT" """
 9270            )
 9271
 9272            # Add vaf_normalization to header
 9273            vcf_reader.formats[vaf_normalization_tag] = vcf.parser._Format(
 9274                id=vaf_normalization_tag,
 9275                num="1",
 9276                type="Float",
 9277                desc=vcf_infos_tags.get(vaf_normalization_tag, "VAF Variant Frequency"),
 9278                type_code=self.code_type_map.get("Float"),
 9279            )
 9280
 9281            # Create fields to add in INFO
 9282            sql_vaf_normalization_set = " , ".join(vaf_normalization_set)
 9283
 9284            # Update
 9285            sql_update = f"""
 9286                UPDATE {table_variants}
 9287                SET {sql_vaf_normalization_set}
 9288                FROM dataframe_vaf_normalization
 9289                WHERE variants."{variant_id_column}" = dataframe_vaf_normalization."{variant_id_column}"
 9290
 9291            """
 9292            self.conn.execute(sql_update)
 9293
 9294            # Remove added columns
 9295            for added_column in added_columns:
 9296                self.drop_column(column=added_column)
 9297
 9298            # Delete dataframe
 9299            del dataframe_vaf_normalization
 9300            gc.collect()
 9301
    def calculation_genotype_stats(self, info: str = "VAF") -> None:
        """
        The `calculation_genotype_stats` function calculates genotype statistics for a given information
        field in a VCF file and updates the INFO column of the variants table with the calculated
        statistics.

        Seven INFO tags are generated and appended to INFO: `<info>_stats_nb`, `_list`, `_min`,
        `_max`, `_mean`, `_mediane` and `_stdev`.

        :param info: The `info` parameter is a string that represents the type of information for which
        genotype statistics are calculated. It is used to generate various VCF info tags for the
        statistics, such as the number of occurrences, the list of values, the minimum value, the
        maximum value, the mean, the median, defaults to VAF
        :type info: str (optional)
        """

        # Genotype statistics require genotypes: a FORMAT column and at least one sample
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # Name of the annotation field holding the stats dict
            vaf_stats_tag = info + "_stats"

            # VCF INFO tag descriptions, one per generated statistic
            vcf_infos_tags = {
                info + "_stats_nb": f"genotype {info} Statistics - number of {info}",
                info + "_stats_list": f"genotype {info} Statistics - list of {info}",
                info + "_stats_min": f"genotype {info} Statistics - min {info}",
                info + "_stats_max": f"genotype {info} Statistics - max {info}",
                info + "_stats_mean": f"genotype {info} Statistics - mean {info}",
                info + "_stats_mediane": f"genotype {info} Statistics - mediane {info}",
                info
                + "_stats_stdev": f"genotype {info} Statistics - standard deviation {info}",
            }

            # Prefix used for exploded INFO columns
            prefix = self.get_explode_infos_prefix()

            # Name of the dataframe column holding the stats dict
            vaf_stats_infos = prefix + vaf_stats_tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id column (added to the table, removed at the end)
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # Columns to fetch: variant_id, FORMAT and all samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                self.get_header_sample_list()
            )

            # Load genotypes into a dataframe
            dataframe_vaf_stats = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Compute the stats for each variant from all sample genotypes
            # (presumably genotype_stats returns a dict keyed by the tags above — verify)
            dataframe_vaf_stats[vaf_stats_infos] = dataframe_vaf_stats.apply(
                lambda row: genotype_stats(
                    row, samples=self.get_header_sample_list(), info=info
                ),
                axis=1,
            )

            # SQL fragments, one per statistic, concatenated into INFO at the end
            sql_vaf_stats_fields = []

            # Extract each statistic into its own column and build its SQL fragment
            for stat in vcf_infos_tags:

                # Extract this statistic from the stats dict ('' when missing)
                dataframe_vaf_stats[stat] = dataframe_vaf_stats[vaf_stats_infos].apply(
                    lambda x: dict(x).get(stat, "")
                )

                # Declare the statistic tag in the VCF header
                vcf_reader.infos[stat] = vcf.parser._Info(
                    stat,
                    ".",
                    "String",
                    vcf_infos_tags.get(stat, "genotype statistics"),
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )

                # ';' separates fields from the second one onwards
                # NOTE(review): if an earlier stat is NULL its fragment is '' while a later
                # fragment still carries its ';' prefix, which could yield a doubled ';' —
                # presumably genotype_stats always returns every stat; verify
                if len(sql_vaf_stats_fields):
                    sep = ";"
                else:
                    sep = ""

                # SQL fragment '<sep><stat>=<value>', empty when the value is NULL
                sql_vaf_stats_fields.append(
                    f"""
                        CASE
                            WHEN dataframe_vaf_stats."{stat}" NOT NULL
                            THEN concat(
                                    '{sep}{stat}=',
                                    dataframe_vaf_stats."{stat}"
                                )
                            ELSE ''
                        END
                    """
                )

            # All fragments become arguments of a single SQL concat()
            sql_vaf_stats_fields_set = ",  ".join(sql_vaf_stats_fields)

            # Append all statistics to INFO, joining the registered dataframe on variant id
            sql_update = f"""
                UPDATE {table_variants}
                SET "INFO" = 
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        {sql_vaf_stats_fields_set}
                    )
                FROM dataframe_vaf_stats
                WHERE {table_variants}."{variant_id_column}" = dataframe_vaf_stats."{variant_id_column}"

            """
            self.conn.execute(sql_update)

            # Remove the temporary variant id column
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Release the dataframe memory
            del dataframe_vaf_stats
            gc.collect()
 9439
 9440    def calculation_transcripts_annotation(
 9441        self, info_json: str = None, info_format: str = None
 9442    ) -> None:
 9443        """
 9444        The `calculation_transcripts_annotation` function creates a transcripts table and adds an info
 9445        field to it if transcripts are available.
 9446
 9447        :param info_json: The `info_json` parameter in the `calculation_transcripts_annotation` method
 9448        is a string parameter that represents the information field to be used in the transcripts JSON.
 9449        It is used to specify the JSON format for the transcripts information. If no value is provided
 9450        when calling the method, it defaults to "
 9451        :type info_json: str
 9452        :param info_format: The `info_format` parameter in the `calculation_transcripts_annotation`
 9453        method is a string parameter that specifies the format of the information field to be used in
 9454        the transcripts JSON. It is used to define the format of the information field
 9455        :type info_format: str
 9456        """
 9457
 9458        # Create transcripts table
 9459        transcripts_table = self.create_transcript_view()
 9460
 9461        # Add info field
 9462        if transcripts_table:
 9463            self.transcript_view_to_variants(
 9464                transcripts_table=transcripts_table,
 9465                transcripts_info_field_json=info_json,
 9466                transcripts_info_field_format=info_format,
 9467            )
 9468        else:
 9469            log.info("No Transcripts to process. Check param.json file configuration")
 9470
 9471    def calculation_transcripts_prioritization(self) -> None:
 9472        """
 9473        The function `calculation_transcripts_prioritization` creates a transcripts table and
 9474        prioritizes transcripts based on certain criteria.
 9475        """
 9476
 9477        # Create transcripts table
 9478        transcripts_table = self.create_transcript_view()
 9479
 9480        # Add info field
 9481        if transcripts_table:
 9482            self.transcripts_prioritization(transcripts_table=transcripts_table)
 9483        else:
 9484            log.info("No Transcripts to process. Check param.json file configuration")
 9485
 9486    ###############
 9487    # Transcripts #
 9488    ###############
 9489
 9490    def transcripts_prioritization(
 9491        self, transcripts_table: str = None, param: dict = {}
 9492    ) -> bool:
 9493        """
 9494        The `transcripts_prioritization` function prioritizes transcripts based on certain parameters
 9495        and updates the variants table with the prioritized information.
 9496
 9497        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name
 9498        of the table containing transcripts data. If no value is provided, it defaults to "transcripts".
 9499        This parameter is used to identify the table where the transcripts data is stored for the
 9500        prioritization process
 9501        :type transcripts_table: str
 9502        :param param: The `param` parameter in the `transcripts_prioritization` method is a dictionary
 9503        that contains various configuration settings for the prioritization process of transcripts. It
 9504        is used to customize the behavior of the prioritization algorithm and includes settings such as
 9505        the prefix for prioritization fields, default profiles, and other
 9506        :type param: dict
 9507        :return: The function `transcripts_prioritization` returns a boolean value `True` if the
 9508        transcripts prioritization process is successfully completed, and `False` if there are any
 9509        issues or if no profile is defined for transcripts prioritization.
 9510        """
 9511
 9512        log.debug("Start transcripts prioritization...")
 9513
 9514        # Param
 9515        if not param:
 9516            param = self.get_param()
 9517
 9518        # Variants table
 9519        table_variants = self.get_table_variants()
 9520        log.debug(f"transcripts_table={transcripts_table}")
 9521        # Transcripts table
 9522        if transcripts_table is None:
 9523            log.debug(f"transcripts_table={transcripts_table}")
 9524            transcripts_table = self.create_transcript_view(
 9525                transcripts_table="transcripts", param=param
 9526            )
 9527            log.debug(f"transcripts_table={transcripts_table}")
 9528        if transcripts_table is None:
 9529            msg_err = "No Transcripts table availalble"
 9530            log.error(msg_err)
 9531            raise ValueError(msg_err)
 9532
 9533        # Get transcripts columns
 9534        columns_as_list_query = f"""
 9535            DESCRIBE {transcripts_table}
 9536        """
 9537        columns_as_list = list(
 9538            self.get_query_to_df(columns_as_list_query)["column_name"]
 9539        )
 9540
 9541        # Create INFO if not exists
 9542        if "INFO" not in columns_as_list:
 9543            query_add_info = f"""
 9544                ALTER TABLE {transcripts_table} ADD COLUMN INFO STRING DEFAULT '';
 9545            """
 9546            self.execute_query(query_add_info)
 9547
 9548        # Prioritization param and Force only PZ Score and Flag
 9549        pz_param = param.get("transcripts", {}).get("prioritization", {})
 9550        pz_fields_score = pz_param.get("pzprefix", "PTZ") + "Score"
 9551        pz_fields_flag = pz_param.get("pzprefix", "PTZ") + "Flag"
 9552        pz_fields_transcripts = pz_param.get("pzprefix", "PTZ") + "Transcript"
 9553        pz_param["pzfields"] = [pz_fields_score, pz_fields_flag]
 9554        pz_profile_default = (
 9555            param.get("transcripts", {}).get("prioritization", {}).get("profiles", None)
 9556        )
 9557
 9558        # Exit if no profile
 9559        if pz_profile_default is None:
 9560            log.warning("No profile defined for transcripts prioritization")
 9561            return False
 9562
 9563        # Prioritization
 9564        prioritization_result = self.prioritization(
 9565            table=transcripts_table,
 9566            pz_param=param.get("transcripts", {}).get("prioritization", {}),
 9567        )
 9568        if not prioritization_result:
 9569            log.warning("Transcripts prioritization not processed")
 9570            return False
 9571
 9572        # Explode PZ fields
 9573        self.explode_infos(
 9574            table=transcripts_table,
 9575            fields=param.get("transcripts", {})
 9576            .get("prioritization", {})
 9577            .get("pzfields", []),
 9578        )
 9579
 9580        # Export Transcripts prioritization infos to variants table
 9581        query_update = f"""
 9582            WITH RankedTranscripts AS (
 9583                SELECT
 9584                    "#CHROM", POS, REF, ALT, transcript, {pz_fields_score}, {pz_fields_flag},
 9585                    ROW_NUMBER() OVER (
 9586                        PARTITION BY "#CHROM", POS, REF, ALT
 9587                        ORDER BY {pz_fields_flag} ASC, {pz_fields_score} DESC, transcript ASC
 9588                    ) AS rn
 9589                FROM
 9590                    {transcripts_table}
 9591            )
 9592            UPDATE {table_variants}
 9593                SET
 9594                INFO = CONCAT(CASE
 9595                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 9596                            THEN ''
 9597                            ELSE concat("INFO", ';')
 9598                        END,
 9599                        concat('{pz_fields_transcripts}=', transcript, ';{pz_fields_score}=', {pz_fields_score}, ';{pz_fields_flag}=', {pz_fields_flag})
 9600                        )
 9601            FROM
 9602                RankedTranscripts
 9603            WHERE
 9604                rn = 1
 9605                AND variants."#CHROM" = RankedTranscripts."#CHROM"
 9606                AND variants."POS" = RankedTranscripts."POS"
 9607                AND variants."REF" = RankedTranscripts."REF"
 9608                AND variants."ALT" = RankedTranscripts."ALT"
 9609                
 9610        """
 9611        self.execute_query(query=query_update)
 9612
 9613        # Add PZ Transcript in header
 9614        self.get_header().infos[pz_fields_transcripts] = vcf.parser._Info(
 9615            pz_fields_transcripts,
 9616            ".",
 9617            "String",
 9618            f"Transcript selected from transcripts prioritization process, profile {pz_profile_default}",
 9619            "unknown",
 9620            "unknown",
 9621            code_type_map["String"],
 9622        )
 9623
 9624        # Return
 9625        return True
 9626
 9627    def create_transcript_view_from_columns_map(
 9628        self,
 9629        transcripts_table: str = "transcripts",
 9630        columns_maps: dict = {},
 9631        added_columns: list = [],
 9632        temporary_tables: list = None,
 9633        annotation_fields: list = None,
 9634    ) -> tuple[list, list, list]:
 9635        """
 9636        The `create_transcript_view_from_columns_map` function generates a temporary table view based on
 9637        specified columns mapping for transcripts data.
 9638
 9639        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name of
 9640        the table where the transcripts data is stored or will be stored in the database. This table
 9641        typically contains information about transcripts such as Ensembl transcript IDs, gene names, scores,
 9642        predictions, etc. It defaults to "transcripts, defaults to transcripts
 9643        :type transcripts_table: str (optional)
 9644        :param columns_maps: The `columns_maps` parameter is a dictionary that contains information about
 9645        how to map columns from a transcripts table to create a view. Each entry in the `columns_maps` list
 9646        represents a mapping configuration for a specific set of columns. It typically includes details such
 9647        as the main transcript column and additional information columns
 9648        :type columns_maps: dict
 9649        :param added_columns: The `added_columns` parameter in the `create_transcript_view_from_columns_map`
 9650        function is a list that stores the additional columns that will be added to the view being created
 9651        based on the columns map provided. These columns are generated by exploding the transcript
 9652        information columns along with the main transcript column
 9653        :type added_columns: list
 9654        :param temporary_tables: The `temporary_tables` parameter in the
 9655        `create_transcript_view_from_columns_map` function is a list that stores the names of temporary
 9656        tables created during the process of creating a transcript view from a columns map. These temporary
 9657        tables are used to store intermediate results or transformations before the final view is generated
 9658        :type temporary_tables: list
 9659        :param annotation_fields: The `annotation_fields` parameter in the
 9660        `create_transcript_view_from_columns_map` function is a list that stores the fields that are used
 9661        for annotation in the query view creation process. These fields are extracted from the
 9662        `transcripts_column` and `transcripts_infos_columns` specified in the `columns
 9663        :type annotation_fields: list
 9664        :return: The function `create_transcript_view_from_columns_map` returns a tuple containing three
 9665        lists: `added_columns`, `temporary_tables`, and `annotation_fields`.
 9666        """
 9667
 9668        log.debug("Start transcrpts view creation from columns map...")
 9669
 9670        # "from_columns_map": [
 9671        #     {
 9672        #         "transcripts_column": "Ensembl_transcriptid",
 9673        #         "transcripts_infos_columns": [
 9674        #             "genename",
 9675        #             "Ensembl_geneid",
 9676        #             "LIST_S2_score",
 9677        #             "LIST_S2_pred",
 9678        #         ],
 9679        #     },
 9680        #     {
 9681        #         "transcripts_column": "Ensembl_transcriptid",
 9682        #         "transcripts_infos_columns": [
 9683        #             "genename",
 9684        #             "VARITY_R_score",
 9685        #             "Aloft_pred",
 9686        #         ],
 9687        #     },
 9688        # ],
 9689
 9690        # Init
 9691        if temporary_tables is None:
 9692            temporary_tables = []
 9693        if annotation_fields is None:
 9694            annotation_fields = []
 9695
 9696        # Variants table
 9697        table_variants = self.get_table_variants()
 9698
 9699        for columns_map in columns_maps:
 9700
 9701            # Transcript column
 9702            transcripts_column = columns_map.get("transcripts_column", None)
 9703
 9704            # Transcripts infos columns
 9705            transcripts_infos_columns = columns_map.get("transcripts_infos_columns", [])
 9706
 9707            if transcripts_column is not None:
 9708
 9709                # Explode
 9710                added_columns += self.explode_infos(
 9711                    fields=[transcripts_column] + transcripts_infos_columns
 9712                )
 9713
 9714                # View clauses
 9715                clause_select = []
 9716                for field in [transcripts_column] + transcripts_infos_columns:
 9717                    clause_select.append(
 9718                        f""" regexp_split_to_table("{field}", ',') AS '{field}' """
 9719                    )
 9720                    if field not in [transcripts_column]:
 9721                        annotation_fields.append(field)
 9722
 9723                # Querey View
 9724                query = f""" 
 9725                    SELECT
 9726                        "#CHROM", POS, REF, ALT, INFO,
 9727                        "{transcripts_column}" AS 'transcript',
 9728                        {", ".join(clause_select)}
 9729                    FROM (
 9730                        SELECT 
 9731                            "#CHROM", POS, REF, ALT, INFO,
 9732                            {", ".join(clause_select)}
 9733                        FROM {table_variants}
 9734                        )
 9735                    WHERE "{transcripts_column}" IS NOT NULL
 9736                """
 9737
 9738                # Create temporary table
 9739                temporary_table = transcripts_table + "".join(
 9740                    random.choices(string.ascii_uppercase + string.digits, k=10)
 9741                )
 9742
 9743                # Temporary_tables
 9744                temporary_tables.append(temporary_table)
 9745                query_view = f"""
 9746                    CREATE TEMPORARY TABLE {temporary_table}
 9747                    AS ({query})
 9748                """
 9749                self.execute_query(query=query_view)
 9750
 9751        return added_columns, temporary_tables, annotation_fields
 9752
 9753    def create_transcript_view_from_column_format(
 9754        self,
 9755        transcripts_table: str = "transcripts",
 9756        column_formats: dict = {},
 9757        temporary_tables: list = None,
 9758        annotation_fields: list = None,
 9759    ) -> tuple[list, list, list]:
 9760        """
 9761        The `create_transcript_view_from_column_format` function generates a transcript view based on
 9762        specified column formats, adds additional columns and annotation fields, and returns the list of
 9763        temporary tables and annotation fields.
 9764
 9765        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name of
 9766        the table containing the transcripts data. This table will be used as the base table for creating
 9767        the transcript view. The default value for this parameter is "transcripts", but you can provide a
 9768        different table name if needed, defaults to transcripts
 9769        :type transcripts_table: str (optional)
 9770        :param column_formats: The `column_formats` parameter is a dictionary that contains information
 9771        about the columns to be used for creating the transcript view. Each entry in the dictionary
 9772        specifies the mapping between a transcripts column and a transcripts infos column. For example, in
 9773        the provided code snippet:
 9774        :type column_formats: dict
 9775        :param temporary_tables: The `temporary_tables` parameter in the
 9776        `create_transcript_view_from_column_format` function is a list that stores the names of temporary
 9777        views created during the process of creating a transcript view from a column format. These temporary
 9778        views are used to manipulate and extract data before generating the final transcript view. It
 9779        :type temporary_tables: list
 9780        :param annotation_fields: The `annotation_fields` parameter in the
 9781        `create_transcript_view_from_column_format` function is a list that stores the annotation fields
 9782        that are extracted from the temporary views created during the process. These annotation fields are
 9783        obtained by querying the temporary views and extracting the column names excluding specific columns
 9784        like `#CH
 9785        :type annotation_fields: list
 9786        :return: The `create_transcript_view_from_column_format` function returns two lists:
 9787        `temporary_tables` and `annotation_fields`.
 9788        """
 9789
 9790        log.debug("Start transcrpts view creation from column format...")
 9791
 9792        #  "from_column_format": [
 9793        #     {
 9794        #         "transcripts_column": "ANN",
 9795        #         "transcripts_infos_column": "Feature_ID",
 9796        #     }
 9797        # ],
 9798
 9799        # Init
 9800        if temporary_tables is None:
 9801            temporary_tables = []
 9802        if annotation_fields is None:
 9803            annotation_fields = []
 9804
 9805        for column_format in column_formats:
 9806
 9807            # annotation field and transcript annotation field
 9808            annotation_field = column_format.get("transcripts_column", "ANN")
 9809            transcript_annotation = column_format.get(
 9810                "transcripts_infos_column", "Feature_ID"
 9811            )
 9812
 9813            # Temporary View name
 9814            temporary_view_name = transcripts_table + "".join(
 9815                random.choices(string.ascii_uppercase + string.digits, k=10)
 9816            )
 9817
 9818            # Create temporary view name
 9819            temporary_view_name = self.annotation_format_to_table(
 9820                uniquify=True,
 9821                annotation_field=annotation_field,
 9822                view_name=temporary_view_name,
 9823                annotation_id=transcript_annotation,
 9824            )
 9825
 9826            # Annotation fields
 9827            if temporary_view_name:
 9828                query_annotation_fields = f"""
 9829                    SELECT *
 9830                    FROM (
 9831                        DESCRIBE SELECT *
 9832                        FROM {temporary_view_name}
 9833                        )
 9834                        WHERE column_name not in ('#CHROM', 'POS', 'REF', 'ALT')
 9835                """
 9836                df_annotation_fields = self.get_query_to_df(
 9837                    query=query_annotation_fields
 9838                )
 9839
 9840                # Add temporary view and annotation fields
 9841                temporary_tables.append(temporary_view_name)
 9842                annotation_fields += list(set(df_annotation_fields["column_name"]))
 9843
 9844        return temporary_tables, annotation_fields
 9845
 9846    def create_transcript_view(
 9847        self,
 9848        transcripts_table: str = None,
 9849        transcripts_table_drop: bool = True,
 9850        param: dict = {},
 9851    ) -> str:
 9852        """
 9853        The `create_transcript_view` function generates a transcript view by processing data from a
 9854        specified table based on provided parameters and structural information.
 9855
 9856        :param transcripts_table: The `transcripts_table` parameter in the `create_transcript_view` function
 9857        is used to specify the name of the table that will store the final transcript view data. If a table
 9858        name is not provided, the function will create a new table to store the transcript view data, and by
 9859        default,, defaults to transcripts
 9860        :type transcripts_table: str (optional)
 9861        :param transcripts_table_drop: The `transcripts_table_drop` parameter in the
 9862        `create_transcript_view` function is a boolean parameter that determines whether to drop the
 9863        existing transcripts table before creating a new one. If `transcripts_table_drop` is set to `True`,
 9864        the function will drop the existing transcripts table if it exists, defaults to True
 9865        :type transcripts_table_drop: bool (optional)
 9866        :param param: The `param` parameter in the `create_transcript_view` function is a dictionary that
 9867        contains information needed to create a transcript view. It includes details such as the structure
 9868        of the transcripts, columns mapping, column formats, and other necessary information for generating
 9869        the view. This parameter allows for flexibility and customization
 9870        :type param: dict
 9871        :return: The `create_transcript_view` function returns the name of the transcripts table that was
 9872        created or modified during the execution of the function.
 9873        """
 9874
 9875        log.debug("Start transcripts view creation...")
 9876
 9877        # Default
 9878        transcripts_table_default = "transcripts"
 9879
 9880        # Param
 9881        if not param:
 9882            param = self.get_param()
 9883
 9884        # Struct
 9885        struct = param.get("transcripts", {}).get("struct", None)
 9886
 9887        if struct:
 9888
 9889            # Transcripts table
 9890            if transcripts_table is None:
 9891                transcripts_table = param.get("transcripts", {}).get(
 9892                    "table", transcripts_table_default
 9893                )
 9894
 9895            # added_columns
 9896            added_columns = []
 9897
 9898            # Temporary tables
 9899            temporary_tables = []
 9900
 9901            # Annotation fields
 9902            annotation_fields = []
 9903
 9904            # from columns map
 9905            columns_maps = struct.get("from_columns_map", [])
 9906            added_columns_tmp, temporary_tables_tmp, annotation_fields_tmp = (
 9907                self.create_transcript_view_from_columns_map(
 9908                    transcripts_table=transcripts_table,
 9909                    columns_maps=columns_maps,
 9910                    added_columns=added_columns,
 9911                    temporary_tables=temporary_tables,
 9912                    annotation_fields=annotation_fields,
 9913                )
 9914            )
 9915            added_columns += added_columns_tmp
 9916            temporary_tables += temporary_tables_tmp
 9917            annotation_fields += annotation_fields_tmp
 9918
 9919            # from column format
 9920            column_formats = struct.get("from_column_format", [])
 9921            temporary_tables_tmp, annotation_fields_tmp = (
 9922                self.create_transcript_view_from_column_format(
 9923                    transcripts_table=transcripts_table,
 9924                    column_formats=column_formats,
 9925                    temporary_tables=temporary_tables,
 9926                    annotation_fields=annotation_fields,
 9927                )
 9928            )
 9929            temporary_tables += temporary_tables_tmp
 9930            annotation_fields += annotation_fields_tmp
 9931
 9932            # Merge temporary tables query
 9933            query_merge = ""
 9934            for temporary_table in temporary_tables:
 9935
 9936                # First temporary table
 9937                if not query_merge:
 9938                    query_merge = f"""
 9939                        SELECT * FROM {temporary_table}
 9940                    """
 9941                # other temporary table (using UNION)
 9942                else:
 9943                    query_merge += f"""
 9944                        UNION BY NAME SELECT * FROM {temporary_table}
 9945                    """
 9946
 9947            # Merge on transcript
 9948            query_merge_on_transcripts_annotation_fields = []
 9949            # Aggregate all annotations fields
 9950            for annotation_field in set(annotation_fields):
 9951                query_merge_on_transcripts_annotation_fields.append(
 9952                    f""" list_aggregate(list_distinct(array_agg({annotation_field})), 'string_agg', ',') AS {annotation_field} """
 9953                )
 9954            # Query for transcripts view
 9955            query_merge_on_transcripts = f"""
 9956                SELECT "#CHROM", POS, REF, ALT, INFO, transcript, {", ".join(query_merge_on_transcripts_annotation_fields)}
 9957                FROM ({query_merge})
 9958                GROUP BY "#CHROM", POS, REF, ALT, INFO, transcript
 9959            """
 9960
 9961            # Drop transcript view is necessary
 9962            if transcripts_table_drop:
 9963                query_drop = f"""
 9964                    DROP TABLE IF EXISTS {transcripts_table};
 9965                """
 9966                self.execute_query(query=query_drop)
 9967
 9968            # Merge and create transcript view
 9969            query_create_view = f"""
 9970                CREATE TABLE IF NOT EXISTS {transcripts_table}
 9971                AS {query_merge_on_transcripts}
 9972            """
 9973            self.execute_query(query=query_create_view)
 9974
 9975            # Remove added columns
 9976            for added_column in added_columns:
 9977                self.drop_column(column=added_column)
 9978
 9979        else:
 9980
 9981            transcripts_table = None
 9982
 9983        return transcripts_table
 9984
    def annotation_format_to_table(
        self,
        uniquify: bool = True,
        annotation_field: str = "ANN",
        annotation_id: str = "Feature_ID",
        view_name: str = "transcripts",
    ) -> str:
        """
        Convert a structured annotation INFO field (e.g. snpEff "ANN") into a
        temporary table with one column per annotation sub-field and one row
        per annotation entry.

        The sub-field names are parsed from the quoted, pipe-separated list in
        the field's header description; each entry is exploded to JSON, typed
        by content inspection, and materialized as a temporary table named
        `view_name` with an extra 'transcript' column taken from
        `annotation_id`.

        :param uniquify: Whether exploded annotation values are uniquified by
        `explode_annotation_format`, defaults to True
        :type uniquify: bool (optional)
        :param annotation_field: INFO field holding the annotations, defaults
        to "ANN"
        :type annotation_field: str (optional)
        :param annotation_id: Annotation sub-field used as transcript
        identifier (non-alphanumeric characters are stripped), defaults to
        "Feature_ID"
        :type annotation_id: str (optional)
        :param view_name: Name of the temporary table to create, defaults to
        "transcripts"
        :type view_name: str (optional)
        :return: The name of the created table, or None if `annotation_field`
        is not declared in the VCF header
        :raises ValueError: If the header description does not contain a
        quoted sub-field list
        """

        # Name of the intermediate JSON column added to the dataframe
        annotation_format = "annotation_explode"

        # Sanitize the transcript id column name (alphanumeric only, to match
        # the sanitized sub-field column names built below)
        annotation_id = "".join(char for char in annotation_id if char.isalnum())

        # Prefix
        # NOTE(review): any truthy prefix is replaced by "INFO/" — presumably
        # a normalization of the explode prefix; confirm this is not meant to
        # be `if not prefix:` (a non-empty prefix also makes the unprefixed
        # column references in the SQL below suspicious).
        prefix = self.get_explode_infos_prefix()
        if prefix:
            prefix = "INFO/"

        # Column names of the exploded annotation field and of the
        # intermediate JSON column
        annotation_infos = prefix + annotation_field
        annotation_format_infos = prefix + annotation_format

        # Variants table
        table_variants = self.get_table_variants()

        # VCF header (provides the annotation field description)
        vcf_reader = self.get_header()

        # Columns added to the variants table, dropped again before returning
        added_columns = []

        # Explode the annotation INFO field into a column
        added_columns += self.explode_infos(fields=[annotation_field])

        if annotation_field in vcf_reader.infos:

            # Extract the sub-field names from the header description: the
            # quoted, " | "-separated list (e.g. snpEff's "Functional
            # annotations: 'Allele | Annotation | ...'")
            ann_description = vcf_reader.infos[annotation_field].desc
            pattern = r"'(.+?)'"
            match = re.search(pattern, ann_description)
            if match:
                ann_header_match = match.group(1).split(" | ")
                ann_header = []
                ann_header_desc = {}
                for i in range(len(ann_header_match)):
                    # Sanitized name (alphanumeric only) mapped to the
                    # original sub-field name
                    ann_header_info = "".join(
                        char for char in ann_header_match[i] if char.isalnum()
                    )
                    ann_header.append(ann_header_info)
                    ann_header_desc[ann_header_info] = ann_header_match[i]
                if not ann_header_desc:
                    raise ValueError("Invalid header description format")
            else:
                raise ValueError("Invalid header description format")

            # Variant id column (added to the variants table)
            variant_id_column = self.get_variant_id_column()
            added_columns += [variant_id_column]

            # Load variants with the exploded annotation column into a pandas
            # dataframe. The SQL below refers to this dataframe BY ITS LOCAL
            # VARIABLE NAME (`dataframe_annotation_format`) via DuckDB's
            # replacement scan — do not rename it.
            dataframe_annotation_format = self.get_query_to_df(
                f""" SELECT "#CHROM", POS, REF, ALT, INFO, "{variant_id_column}", "{annotation_infos}" FROM {table_variants} """
            )

            # Explode each annotation value into a JSON document (one object
            # per annotation entry, keyed by the header sub-field names)
            dataframe_annotation_format[
                annotation_format_infos
            ] = dataframe_annotation_format[annotation_infos].apply(
                lambda x: explode_annotation_format(
                    annotation=str(x),
                    uniquify=uniquify,
                    output_format="JSON",
                    prefix="",
                    header=list(ann_header_desc.values()),
                )
            )

            # Find the JSON keys present in the first annotation entry
            query_json = f"""SELECT distinct(unnest(json_keys({annotation_format}, '$.0'))) AS 'key' FROM dataframe_annotation_format;"""
            df_keys = self.get_query_to_df(query=query_json)

            # Build one typed SELECT clause per key
            query_json_key = []
            for _, row in df_keys.iterrows():

                # Key
                key = row.iloc[0]

                # Sanitized key used as column name
                key_clean = "".join(char for char in key if char.isalnum())

                # Query extracting all values of this key, to sample its type
                query_json_type = f"""SELECT unnest(json_extract_string({annotation_format}, '$.*."{key}"')) AS '{key_clean}' FROM dataframe_annotation_format WHERE trim('{key}') NOT IN ('');"""

                # Get DataFrame from query
                df_json_type = self.get_query_to_df(query=query_json_type)

                # Fill missing values with empty strings and then replace empty strings or None with NaN and drop rows with NaN
                with pd.option_context("future.no_silent_downcasting", True):
                    df_json_type.fillna(value="", inplace=True)
                    replace_dict = {None: np.nan, "": np.nan}
                    df_json_type.replace(replace_dict, inplace=True)
                    df_json_type.dropna(inplace=True)

                # Detect column type from the non-empty sampled values
                column_type = detect_column_type(df_json_type[key_clean])

                # Typed extraction clause; NULLIF turns empty strings into
                # NULL before the cast
                query_json_key.append(
                    f"""NULLIF(unnest(json_extract_string({annotation_format}, '$.*."{key}"')), '')::{column_type}  AS '{prefix}{key_clean}' """
                )

            # Create the temporary table with a 'transcript' column copied
            # from the annotation_id sub-field
            query_view = f"""
                CREATE TEMPORARY TABLE {view_name}
                AS (
                    SELECT *, {annotation_id} AS 'transcript'
                    FROM (
                        SELECT "#CHROM", POS, REF, ALT, INFO, {",".join(query_json_key)}
                        FROM dataframe_annotation_format
                        )
                    );
            """
            self.execute_query(query=query_view)

        else:

            # Annotation field not in header: signal failure with None
            view_name = None

        # Remove columns added to the variants table
        for added_column in added_columns:
            self.drop_column(column=added_column)

        return view_name
10146
10147    def transcript_view_to_variants(
10148        self,
10149        transcripts_table: str = None,
10150        transcripts_column_id: str = None,
10151        transcripts_info_json: str = None,
10152        transcripts_info_field_json: str = None,
10153        transcripts_info_format: str = None,
10154        transcripts_info_field_format: str = None,
10155        param: dict = {},
10156    ) -> bool:
10157        """
10158        The `transcript_view_to_variants` function updates a variants table with information from
10159        transcripts in JSON format.
10160
10161        :param transcripts_table: The `transcripts_table` parameter is used to specify the name of the
10162        table containing the transcripts data. If this parameter is not provided, the function will
10163        attempt to retrieve it from the `param` dictionary or use a default value of "transcripts"
10164        :type transcripts_table: str
10165        :param transcripts_column_id: The `transcripts_column_id` parameter is used to specify the
10166        column in the `transcripts_table` that contains the unique identifier for each transcript. This
10167        identifier is used to match transcripts with variants in the database
10168        :type transcripts_column_id: str
10169        :param transcripts_info_json: The `transcripts_info_json` parameter is used to specify the name
10170        of the column in the variants table where the transcripts information will be stored in JSON
10171        format. This parameter allows you to define the column in the variants table that will hold the
10172        JSON-formatted information about transcripts
10173        :type transcripts_info_json: str
10174        :param transcripts_info_field_json: The `transcripts_info_field_json` parameter is used to
10175        specify the field in the VCF header that will contain information about transcripts in JSON
10176        format. This field will be added to the VCF header as an INFO field with the specified name
10177        :type transcripts_info_field_json: str
10178        :param transcripts_info_format: The `transcripts_info_format` parameter is used to specify the
10179        format of the information about transcripts that will be stored in the variants table. This
10180        format can be used to define how the transcript information will be structured or displayed
10181        within the variants table
10182        :type transcripts_info_format: str
10183        :param transcripts_info_field_format: The `transcripts_info_field_format` parameter is used to
10184        specify the field in the VCF header that will contain information about transcripts in a
10185        specific format. This field will be added to the VCF header as an INFO field with the specified
10186        name
10187        :type transcripts_info_field_format: str
10188        :param param: The `param` parameter in the `transcript_view_to_variants` method is a dictionary
10189        that contains various configuration settings related to transcripts. It is used to provide
10190        default values for certain parameters if they are not explicitly provided when calling the
10191        method. The `param` dictionary can be passed as an argument
10192        :type param: dict
10193        :return: The function `transcript_view_to_variants` returns a boolean value. It returns `True`
10194        if the operation is successful and `False` if certain conditions are not met.
10195        """
10196
10197        msg_info_prefix = "Start transcripts view to variants annotations"
10198
10199        log.debug(f"{msg_info_prefix}...")
10200
10201        # Default
10202        transcripts_table_default = "transcripts"
10203        transcripts_column_id_default = "transcript"
10204        transcripts_info_json_default = None
10205        transcripts_info_format_default = None
10206        transcripts_info_field_json_default = None
10207        transcripts_info_field_format_default = None
10208
10209        # Param
10210        if not param:
10211            param = self.get_param()
10212
10213        # Transcripts table
10214        if transcripts_table is None:
10215            transcripts_table = param.get("transcripts", {}).get(
10216                "table", transcripts_table_default
10217            )
10218
10219        # Transcripts column ID
10220        if transcripts_column_id is None:
10221            transcripts_column_id = param.get("transcripts", {}).get(
10222                "column_id", transcripts_column_id_default
10223            )
10224
10225        # Transcripts info json
10226        if transcripts_info_json is None:
10227            transcripts_info_json = param.get("transcripts", {}).get(
10228                "transcripts_info_json", transcripts_info_json_default
10229            )
10230
10231        # Transcripts info field JSON
10232        if transcripts_info_field_json is None:
10233            transcripts_info_field_json = param.get("transcripts", {}).get(
10234                "transcripts_info_field_json", transcripts_info_field_json_default
10235            )
10236        # if transcripts_info_field_json is not None and transcripts_info_json is None:
10237        #     transcripts_info_json = transcripts_info_field_json
10238
10239        # Transcripts info format
10240        if transcripts_info_format is None:
10241            transcripts_info_format = param.get("transcripts", {}).get(
10242                "transcripts_info_format", transcripts_info_format_default
10243            )
10244
10245        # Transcripts info field FORMAT
10246        if transcripts_info_field_format is None:
10247            transcripts_info_field_format = param.get("transcripts", {}).get(
10248                "transcripts_info_field_format", transcripts_info_field_format_default
10249            )
10250        # if (
10251        #     transcripts_info_field_format is not None
10252        #     and transcripts_info_format is None
10253        # ):
10254        #     transcripts_info_format = transcripts_info_field_format
10255
10256        # Variants table
10257        table_variants = self.get_table_variants()
10258
10259        # Check info columns param
10260        if (
10261            transcripts_info_json is None
10262            and transcripts_info_field_json is None
10263            and transcripts_info_format is None
10264            and transcripts_info_field_format is None
10265        ):
10266            return False
10267
10268        # Transcripts infos columns
10269        query_transcripts_infos_columns = f"""
10270            SELECT *
10271            FROM (
10272                DESCRIBE SELECT * FROM {transcripts_table}
10273                )
10274            WHERE "column_name" NOT IN ('#CHROM', 'POS', 'REF', 'ALT', '{transcripts_column_id}')
10275        """
10276        transcripts_infos_columns = list(
10277            self.get_query_to_df(query=query_transcripts_infos_columns)["column_name"]
10278        )
10279
10280        # View results
10281        clause_select = []
10282        clause_to_json = []
10283        clause_to_format = []
10284        for field in transcripts_infos_columns:
10285            clause_select.append(
10286                f""" regexp_split_to_table("{field}", ',') AS '{field}' """
10287            )
10288            clause_to_json.append(f""" '{field}': "{field}" """)
10289            clause_to_format.append(f""" "{field}" """)
10290
10291        # Update
10292        update_set_json = []
10293        update_set_format = []
10294
10295        # VCF header
10296        vcf_reader = self.get_header()
10297
10298        # Transcripts to info column in JSON
10299        if transcripts_info_json is not None:
10300
10301            # Create column on variants table
10302            self.add_column(
10303                table_name=table_variants,
10304                column_name=transcripts_info_json,
10305                column_type="JSON",
10306                default_value=None,
10307                drop=False,
10308            )
10309
10310            # Add header
10311            vcf_reader.infos[transcripts_info_json] = vcf.parser._Info(
10312                transcripts_info_json,
10313                ".",
10314                "String",
10315                "Transcripts in JSON format",
10316                "unknwon",
10317                "unknwon",
10318                self.code_type_map["String"],
10319            )
10320
10321            # Add to update
10322            update_set_json.append(
10323                f""" {transcripts_info_json}=t.{transcripts_info_json} """
10324            )
10325
10326        # Transcripts to info field in JSON
10327        if transcripts_info_field_json is not None:
10328
10329            log.debug(f"{msg_info_prefix} - Annotation in JSON format...")
10330
10331            # Add to update
10332            update_set_json.append(
10333                f""" 
10334                    INFO = concat(
10335                            CASE
10336                                WHEN INFO NOT IN ('', '.')
10337                                THEN INFO
10338                                ELSE ''
10339                            END,
10340                            CASE
10341                                WHEN CAST(t.{transcripts_info_json} AS VARCHAR) NOT IN ('', '.')
10342                                THEN concat(
10343                                    ';{transcripts_info_field_json}=',
10344                                    t.{transcripts_info_json}
10345                                )
10346                                ELSE ''
10347                            END
10348                            )
10349                """
10350            )
10351
10352            # Add header
10353            vcf_reader.infos[transcripts_info_field_json] = vcf.parser._Info(
10354                transcripts_info_field_json,
10355                ".",
10356                "String",
10357                "Transcripts in JSON format",
10358                "unknwon",
10359                "unknwon",
10360                self.code_type_map["String"],
10361            )
10362
10363        if update_set_json:
10364
10365            # Update query
10366            query_update = f"""
10367                UPDATE {table_variants}
10368                    SET {", ".join(update_set_json)}
10369                FROM
10370                (
10371                    SELECT
10372                        "#CHROM", POS, REF, ALT,
10373                            concat(
10374                            '{{',
10375                            string_agg(
10376                                '"' || "{transcripts_column_id}" || '":' ||
10377                                to_json(json_output)
10378                            ),
10379                            '}}'
10380                            )::JSON AS {transcripts_info_json}
10381                    FROM
10382                        (
10383                        SELECT
10384                            "#CHROM", POS, REF, ALT,
10385                            "{transcripts_column_id}",
10386                            to_json(
10387                                {{{",".join(clause_to_json)}}}
10388                            )::JSON AS json_output
10389                        FROM
10390                            (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table})
10391                        WHERE "{transcripts_column_id}" IS NOT NULL
10392                        )
10393                    GROUP BY "#CHROM", POS, REF, ALT
10394                ) AS t
10395                WHERE {table_variants}."#CHROM" = t."#CHROM"
10396                    AND {table_variants}."POS" = t."POS"
10397                    AND {table_variants}."REF" = t."REF"
10398                    AND {table_variants}."ALT" = t."ALT"
10399            """
10400
10401            self.execute_query(query=query_update)
10402
10403        # Transcripts to info column in FORMAT
10404        if transcripts_info_format is not None:
10405
10406            # Create column on variants table
10407            self.add_column(
10408                table_name=table_variants,
10409                column_name=transcripts_info_format,
10410                column_type="VARCHAR",
10411                default_value=None,
10412                drop=False,
10413            )
10414
10415            # Add header
10416            vcf_reader.infos[transcripts_info_format] = vcf.parser._Info(
10417                transcripts_info_format,
10418                ".",
10419                "String",
10420                f"Transcripts annotations: 'transcript | {' | '.join(transcripts_infos_columns)}'",
10421                "unknwon",
10422                "unknwon",
10423                self.code_type_map["String"],
10424            )
10425
10426            # Add to update
10427            update_set_format.append(
10428                f""" {transcripts_info_format}=t.{transcripts_info_format} """
10429            )
10430
10431        # Transcripts to info field in JSON
10432        if transcripts_info_field_format is not None:
10433
10434            log.debug(f"{msg_info_prefix} - Annotation in structured format...")
10435
10436            # Add to update
10437            update_set_format.append(
10438                f""" 
10439                    INFO = concat(
10440                            CASE
10441                                WHEN INFO NOT IN ('', '.')
10442                                THEN INFO
10443                                ELSE ''
10444                            END,
10445                            CASE
10446                                WHEN CAST(t.{transcripts_info_format} AS VARCHAR) NOT IN ('', '.')
10447                                THEN concat(
10448                                    ';{transcripts_info_field_format}=',
10449                                    t.{transcripts_info_format}
10450                                )
10451                                ELSE ''
10452                            END
10453                            )
10454                """
10455            )
10456
10457            # Add header
10458            vcf_reader.infos[transcripts_info_field_format] = vcf.parser._Info(
10459                transcripts_info_field_format,
10460                ".",
10461                "String",
10462                f"Transcripts annotations: 'transcript | {' | '.join(transcripts_infos_columns)}'",
10463                "unknwon",
10464                "unknwon",
10465                self.code_type_map["String"],
10466            )
10467
10468        if update_set_format:
10469
10470            # Update query
10471            query_update = f"""
10472                UPDATE {table_variants}
10473                    SET {", ".join(update_set_format)}
10474                FROM
10475                (
10476                    SELECT
10477                        "#CHROM", POS, REF, ALT,
10478                            string_agg({transcripts_info_format}) AS {transcripts_info_format}
10479                    FROM 
10480                        (
10481                        SELECT
10482                            "#CHROM", POS, REF, ALT,
10483                            "{transcripts_column_id}",
10484                            concat(
10485                                "{transcripts_column_id}",
10486                                '|',
10487                                {", '|', ".join(clause_to_format)}
10488                            ) AS {transcripts_info_format}
10489                        FROM
10490                            (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table})
10491                        )
10492                    GROUP BY "#CHROM", POS, REF, ALT
10493                ) AS t
10494                WHERE {table_variants}."#CHROM" = t."#CHROM"
10495                    AND {table_variants}."POS" = t."POS"
10496                    AND {table_variants}."REF" = t."REF"
10497                    AND {table_variants}."ALT" = t."ALT"
10498            """
10499
10500            self.execute_query(query=query_update)
10501
10502        return True
Variants( conn=None, input: str = None, output: str = None, config: dict = {}, param: dict = {}, load: bool = False)
36    def __init__(
37        self,
38        conn=None,
39        input: str = None,
40        output: str = None,
41        config: dict = {},
42        param: dict = {},
43        load: bool = False,
44    ) -> None:
45        """
46        The function `__init__` initializes the variables, sets the input, output, config, param, connexion and
47        header
48
49        :param conn: the connection to the database
50        :param input: the input file
51        :param output: the output file
52        :param config: a dictionary containing the configuration of the model
53        :param param: a dictionary containing the parameters of the model
54        """
55
56        # Init variables
57        self.init_variables()
58
59        # Input
60        self.set_input(input)
61
62        # Config
63        self.set_config(config)
64
65        # Param
66        self.set_param(param)
67
68        # Output
69        self.set_output(output)
70
71        # connexion
72        self.set_connexion(conn)
73
74        # Header
75        self.set_header()
76
77        # Samples
78        self.set_samples()
79
80        # Load data
81        if load:
82            self.load_data()

The function __init__ initializes the variables, sets the input, output, config, param, connexion and header

Parameters
  • conn: the connection to the database
  • input: the input file
  • output: the output file
  • config: a dictionary containing the configuration of the model
  • param: a dictionary containing the parameters of the model
def set_samples(self, samples: list = None) -> list:
 84    def set_samples(self, samples: list = None) -> list:
 85        """
 86        The function `set_samples` sets the samples attribute of an object to a provided list or
 87        retrieves it from a parameter dictionary.
 88
 89        :param samples: The `set_samples` method is a method of a class that takes a list of samples as
 90        input and sets the `samples` attribute of the class to the provided list. If no samples are
 91        provided, it tries to get the samples from the class's parameters using the `get_param` method
 92        :type samples: list
 93        :return: The `samples` list is being returned.
 94        """
 95
 96        if not samples:
 97            samples = self.get_param().get("samples", {}).get("list", None)
 98
 99        self.samples = samples
100
101        return samples

The function set_samples sets the samples attribute of an object to a provided list or retrieves it from a parameter dictionary.

Parameters
  • samples: The set_samples method is a method of a class that takes a list of samples as input and sets the samples attribute of the class to the provided list. If no samples are provided, it tries to get the samples from the class's parameters using the get_param method
Returns

The samples list is being returned.

def get_samples(self) -> list:
103    def get_samples(self) -> list:
104        """
105        This function returns a list of samples.
106        :return: The `get_samples` method is returning the `samples` attribute of the object.
107        """
108
109        return self.samples

This function returns a list of samples.

Returns

The get_samples method is returning the samples attribute of the object.

def get_samples_check(self) -> bool:
111    def get_samples_check(self) -> bool:
112        """
113        This function returns the value of the "check" key within the "samples" dictionary retrieved
114        from the parameters.
115        :return: The method `get_samples_check` is returning the value of the key "check" inside the
116        "samples" dictionary, which is nested inside the dictionary returned by the `get_param()`
117        method. If the key "check" is not found, it will return `False`.
118        """
119
120        return self.get_param().get("samples", {}).get("check", True)

This function returns the value of the "check" key within the "samples" dictionary retrieved from the parameters.

Returns

The method get_samples_check returns the value of the key "check" inside the "samples" dictionary, which is nested inside the dictionary returned by the get_param() method. If the key "check" is not found, it defaults to True.

def set_input(self, input: str = None) -> None:
122    def set_input(self, input: str = None) -> None:
123        """
124        The function `set_input` takes a file name as input, extracts the name and extension, and sets
125        attributes in the class accordingly.
126
127        :param input: The `set_input` method in the provided code snippet is used to set attributes
128        related to the input file. Here's a breakdown of the parameters and their usage in the method:
129        :type input: str
130        """
131
132        if input and not isinstance(input, str):
133            try:
134                self.input = input.name
135            except:
136                log.error(f"Input file '{input} in bad format")
137                raise ValueError(f"Input file '{input} in bad format")
138        else:
139            self.input = input
140
141        # Input format
142        if input:
143            input_name, input_extension = os.path.splitext(self.input)
144            self.input_name = input_name
145            self.input_extension = input_extension
146            self.input_format = self.input_extension.replace(".", "")

The function set_input takes a file name as input, extracts the name and extension, and sets attributes in the class accordingly.

Parameters
  • input: the input file path, or a file-like object whose name attribute is used as the path. The input name, extension and format attributes are derived from the resulting path.
def set_config(self, config: dict) -> None:
148    def set_config(self, config: dict) -> None:
149        """
150        The set_config function takes a config object and assigns it as the configuration object for the
151        class.
152
153        :param config: The `config` parameter in the `set_config` function is a dictionary object that
154        contains configuration settings for the class. When you call the `set_config` function with a
155        dictionary object as the argument, it will set that dictionary as the configuration object for
156        the class
157        :type config: dict
158        """
159
160        self.config = config

The set_config function takes a config object and assigns it as the configuration object for the class.

Parameters
  • config: The config parameter in the set_config function is a dictionary object that contains configuration settings for the class. When you call the set_config function with a dictionary object as the argument, it will set that dictionary as the configuration object for the class
def set_param(self, param: dict) -> None:
162    def set_param(self, param: dict) -> None:
163        """
164        This function sets a parameter object for the class based on the input dictionary.
165
166        :param param: The `set_param` method you provided takes a dictionary object as input and sets it
167        as the `param` attribute of the class instance
168        :type param: dict
169        """
170
171        self.param = param

This function sets a parameter object for the class based on the input dictionary.

Parameters
  • param: The set_param method you provided takes a dictionary object as input and sets it as the param attribute of the class instance
def init_variables(self) -> None:
173    def init_variables(self) -> None:
174        """
175        This function initializes the variables that will be used in the rest of the class
176        """
177
178        self.prefix = "howard"
179        self.table_variants = "variants"
180        self.dataframe = None
181
182        self.comparison_map = {
183            "gt": ">",
184            "gte": ">=",
185            "lt": "<",
186            "lte": "<=",
187            "equals": "=",
188            "contains": "SIMILAR TO",
189        }
190
191        self.code_type_map = {"Integer": 0, "String": 1, "Float": 2, "Flag": 3}
192
193        self.code_type_map_to_sql = {
194            "Integer": "INTEGER",
195            "String": "VARCHAR",
196            "Float": "FLOAT",
197            "Flag": "VARCHAR",
198        }
199
200        self.index_additionnal_fields = []

This function initializes the variables that will be used in the rest of the class

def get_indexing(self) -> bool:
202    def get_indexing(self) -> bool:
203        """
204        It returns the value of the key "indexing" in the dictionary. If the key is not present, it
205        returns False.
206        :return: The value of the indexing parameter.
207        """
208
209        return self.get_param().get("indexing", False)

It returns the value of the key "indexing" in the dictionary. If the key is not present, it returns False.

Returns

The value of the indexing parameter.

def get_connexion_config(self) -> dict:
211    def get_connexion_config(self) -> dict:
212        """
213        The function `get_connexion_config` returns a dictionary containing the configuration for a
214        connection, including the number of threads and memory limit.
215        :return: a dictionary containing the configuration for the Connexion library.
216        """
217
218        # config
219        config = self.get_config()
220
221        # Connexion config
222        connexion_config = {}
223        threads = self.get_threads()
224
225        # Threads
226        if threads:
227            connexion_config["threads"] = threads
228
229        # Memory
230        # if config.get("memory", None):
231        #     connexion_config["memory_limit"] = config.get("memory")
232        if self.get_memory():
233            connexion_config["memory_limit"] = self.get_memory()
234
235        # Temporary directory
236        if config.get("tmp", None):
237            connexion_config["temp_directory"] = config.get("tmp")
238
239        # Access
240        if config.get("access", None):
241            access = config.get("access")
242            if access in ["RO"]:
243                access = "READ_ONLY"
244            elif access in ["RW"]:
245                access = "READ_WRITE"
246            connexion_db = self.get_connexion_db()
247            if connexion_db in ":memory:":
248                access = "READ_WRITE"
249            connexion_config["access_mode"] = access
250
251        return connexion_config

The function get_connexion_config returns a dictionary containing the configuration for a connection, including the number of threads and memory limit.

Returns

a dictionary containing the database connection configuration (threads, memory limit, temporary directory and access mode).

def get_duckdb_settings(self) -> dict:
253    def get_duckdb_settings(self) -> dict:
254        """
255        The function `get_duckdb_settings` retrieves DuckDB settings from a configuration file or a
256        string.
257        :return: The function `get_duckdb_settings` returns a dictionary object `duckdb_settings_dict`.
258        """
259
260        # config
261        config = self.get_config()
262
263        # duckdb settings
264        duckdb_settings_dict = {}
265        if config.get("duckdb_settings", None):
266            duckdb_settings = config.get("duckdb_settings")
267            duckdb_settings = full_path(duckdb_settings)
268            # duckdb setting is a file
269            if os.path.exists(duckdb_settings):
270                with open(duckdb_settings) as json_file:
271                    duckdb_settings_dict = yaml.safe_load(json_file)
272            # duckdb settings is a string
273            else:
274                duckdb_settings_dict = json.loads(duckdb_settings)
275
276        return duckdb_settings_dict

The function get_duckdb_settings retrieves DuckDB settings from a configuration file or a string.

Returns

The function get_duckdb_settings returns a dictionary object duckdb_settings_dict.

def set_connexion_db(self) -> str:
278    def set_connexion_db(self) -> str:
279        """
280        The function `set_connexion_db` returns the appropriate database connection string based on the
281        input format and connection type.
282        :return: the value of the variable `connexion_db`.
283        """
284
285        # Default connexion db
286        default_connexion_db = ":memory:"
287
288        # Find connexion db
289        if self.get_input_format() in ["db", "duckdb"]:
290            connexion_db = self.get_input()
291        elif self.get_connexion_type() in ["memory", default_connexion_db, None]:
292            connexion_db = default_connexion_db
293        elif self.get_connexion_type() in ["tmpfile"]:
294            tmp_name = tempfile.mkdtemp(
295                prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".db"
296            )
297            connexion_db = f"{tmp_name}/tmp.db"
298        elif self.get_connexion_type() != "":
299            connexion_db = self.get_connexion_type()
300        else:
301            connexion_db = default_connexion_db
302
303        # Set connexion db
304        self.connexion_db = connexion_db
305
306        return connexion_db

The function set_connexion_db returns the appropriate database connection string based on the input format and connection type.

Returns

the value of the variable connexion_db.

def set_connexion(self, conn) -> None:
308    def set_connexion(self, conn) -> None:
309        """
310        The function `set_connexion` creates a connection to a database, with options for different
311        database formats and settings.
312
313        :param conn: The `conn` parameter in the `set_connexion` method is the connection to the
314        database. If a connection is not provided, a new connection to an in-memory database is created.
315        The method then proceeds to set up the connection based on the specified format (e.g., duckdb or
316        sqlite
317        """
318
319        # Connexion db
320        connexion_db = self.set_connexion_db()
321
322        # Connexion config
323        connexion_config = self.get_connexion_config()
324
325        # Connexion format
326        connexion_format = self.get_config().get("connexion_format", "duckdb")
327        # Set connexion format
328        self.connexion_format = connexion_format
329
330        # Connexion
331        if not conn:
332            if connexion_format in ["duckdb"]:
333                conn = duckdb.connect(connexion_db, config=connexion_config)
334                # duckDB settings
335                duckdb_settings = self.get_duckdb_settings()
336                if duckdb_settings:
337                    for setting in duckdb_settings:
338                        setting_value = duckdb_settings.get(setting)
339                        if isinstance(setting_value, str):
340                            setting_value = f"'{setting_value}'"
341                        conn.execute(f"PRAGMA {setting}={setting_value};")
342            elif connexion_format in ["sqlite"]:
343                conn = sqlite3.connect(connexion_db)
344
345        # Set connexion
346        self.conn = conn
347
348        # Log
349        log.debug(f"connexion_format: {connexion_format}")
350        log.debug(f"connexion_db: {connexion_db}")
351        log.debug(f"connexion config: {connexion_config}")
352        log.debug(f"connexion duckdb settings: {self.get_duckdb_settings()}")

The function set_connexion creates a connection to a database, with options for different database formats and settings.

Parameters
  • conn: The conn parameter in the set_connexion method is the connection to the database. If a connection is not provided, a new connection to an in-memory database is created. The method then proceeds to set up the connection based on the specified format (e.g., duckdb or sqlite).
def set_output(self, output: str = None) -> None:
354    def set_output(self, output: str = None) -> None:
355        """
356        The `set_output` function in Python sets the output file based on the input or a specified key
357        in the config file, extracting the output name, extension, and format.
358
359        :param output: The `output` parameter in the `set_output` method is used to specify the name of
360        the output file. If the config file has an 'output' key, the method sets the output to the value
361        of that key. If no output is provided, it sets the output to `None`
362        :type output: str
363        """
364
365        if output and not isinstance(output, str):
366            self.output = output.name
367        else:
368            self.output = output
369
370        # Output format
371        if self.output:
372            output_name, output_extension = os.path.splitext(self.output)
373            self.output_name = output_name
374            self.output_extension = output_extension
375            self.output_format = self.output_extension.replace(".", "")
376        else:
377            self.output_name = None
378            self.output_extension = None
379            self.output_format = None

The set_output function in Python sets the output file based on the input or a specified key in the config file, extracting the output name, extension, and format.

Parameters
  • output: The output parameter in the set_output method is used to specify the name of the output file. If the config file has an 'output' key, the method sets the output to the value of that key. If no output is provided, it sets the output to None
def set_header(self) -> None:
381    def set_header(self) -> None:
382        """
383        It reads the header of a VCF file and stores it as a list of strings and as a VCF object
384        """
385
386        input_file = self.get_input()
387        default_header_list = [
388            "##fileformat=VCFv4.2",
389            "#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO",
390        ]
391
392        # Full path
393        input_file = full_path(input_file)
394
395        if input_file:
396
397            input_format = self.get_input_format()
398            input_compressed = self.get_input_compressed()
399            config = self.get_config()
400            header_list = default_header_list
401            if input_format in [
402                "vcf",
403                "hdr",
404                "tsv",
405                "csv",
406                "psv",
407                "parquet",
408                "db",
409                "duckdb",
410            ]:
411                # header provided in param
412                if config.get("header_file", None):
413                    with open(config.get("header_file"), "rt") as f:
414                        header_list = self.read_vcf_header(f)
415                # within a vcf file format (header within input file itsself)
416                elif input_format in ["vcf", "hdr"] and not os.path.isdir(input_file):
417                    # within a compressed vcf file format (.vcf.gz)
418                    if input_compressed:
419                        with bgzf.open(input_file, "rt") as f:
420                            header_list = self.read_vcf_header(f)
421                    # within an uncompressed vcf file format (.vcf)
422                    else:
423                        with open(input_file, "rt") as f:
424                            header_list = self.read_vcf_header(f)
425                # header provided in default external file .hdr
426                elif os.path.exists((input_file + ".hdr")):
427                    with open(input_file + ".hdr", "rt") as f:
428                        header_list = self.read_vcf_header(f)
429                else:
430                    try:  # Try to get header info fields and file columns
431
432                        with tempfile.TemporaryDirectory() as tmpdir:
433
434                            # Create database
435                            db_for_header = Database(database=input_file)
436
437                            # Get header columns for infos fields
438                            db_header_from_columns = (
439                                db_for_header.get_header_from_columns()
440                            )
441
442                            # Get real columns in the file
443                            db_header_columns = db_for_header.get_columns()
444
445                            # Write header file
446                            header_file_tmp = os.path.join(tmpdir, "header")
447                            f = open(header_file_tmp, "w")
448                            vcf.Writer(f, db_header_from_columns)
449                            f.close()
450
451                            # Replace #CHROM line with rel columns
452                            header_list = db_for_header.read_header_file(
453                                header_file=header_file_tmp
454                            )
455                            header_list[-1] = "\t".join(db_header_columns)
456
457                    except:
458
459                        log.warning(
460                            f"No header for file {input_file}. Set as default VCF header"
461                        )
462                        header_list = default_header_list
463
464            else:  # try for unknown format ?
465
466                log.error(f"Input file format '{input_format}' not available")
467                raise ValueError(f"Input file format '{input_format}' not available")
468
469            if not header_list:
470                header_list = default_header_list
471
472            # header as list
473            self.header_list = header_list
474
475            # header as VCF object
476            self.header_vcf = vcf.Reader(io.StringIO("\n".join(header_list)))
477
478        else:
479
480            self.header_list = None
481            self.header_vcf = None

It reads the header of a VCF file and stores it as a list of strings and as a VCF object

def get_query_to_df(self, query: str = '', limit: int = None) -> pandas.core.frame.DataFrame:
483    def get_query_to_df(self, query: str = "", limit: int = None) -> pd.DataFrame:
484        """
485        The `get_query_to_df` function takes a query as a string and returns the result as a pandas
486        DataFrame based on the connection format.
487
488        :param query: The `query` parameter in the `get_query_to_df` function is a string that
489        represents the SQL query you want to execute. This query will be used to fetch data from a
490        database and convert it into a pandas DataFrame
491        :type query: str
492        :param limit: The `limit` parameter in the `get_query_to_df` function is used to specify the
493        maximum number of rows to be returned in the resulting dataframe. If a limit is provided, the
494        function will only fetch up to that number of rows from the database query result. If no limit
495        is specified,
496        :type limit: int
497        :return: A pandas DataFrame is being returned by the `get_query_to_df` function.
498        """
499
500        # Connexion format
501        connexion_format = self.get_connexion_format()
502
503        # Limit in query
504        if limit:
505            pd.set_option("display.max_rows", limit)
506            if connexion_format in ["duckdb"]:
507                df = (
508                    self.conn.execute(query)
509                    .fetch_record_batch(limit)
510                    .read_next_batch()
511                    .to_pandas()
512                )
513            elif connexion_format in ["sqlite"]:
514                df = next(pd.read_sql_query(query, self.conn, chunksize=limit))
515
516        # Full query
517        else:
518            if connexion_format in ["duckdb"]:
519                df = self.conn.execute(query).df()
520            elif connexion_format in ["sqlite"]:
521                df = pd.read_sql_query(query, self.conn)
522
523        return df

The get_query_to_df function takes a query as a string and returns the result as a pandas DataFrame based on the connection format.

Parameters
  • query: The query parameter in the get_query_to_df function is a string that represents the SQL query you want to execute. This query will be used to fetch data from a database and convert it into a pandas DataFrame
  • limit: The limit parameter in the get_query_to_df function is used to specify the maximum number of rows to be returned in the resulting dataframe. If a limit is provided, the function will only fetch up to that number of rows from the database query result. If no limit is specified, the full query result is returned.
Returns

A pandas DataFrame is being returned by the get_query_to_df function.

def get_overview(self) -> None:
525    def get_overview(self) -> None:
526        """
527        The function prints the input, output, config, and dataframe of the current object
528        """
529        table_variants_from = self.get_table_variants(clause="from")
530        sql_columns = self.get_header_columns_as_sql()
531        sql_query_export = f"SELECT {sql_columns} FROM {table_variants_from}"
532        df = self.get_query_to_df(sql_query_export)
533        log.info(
534            "Input:  "
535            + str(self.get_input())
536            + " ["
537            + str(str(self.get_input_format()))
538            + "]"
539        )
540        log.info(
541            "Output: "
542            + str(self.get_output())
543            + " ["
544            + str(str(self.get_output_format()))
545            + "]"
546        )
547        log.info("Config: ")
548        for d in str(json.dumps(self.get_config(), indent=4, sort_keys=True)).split(
549            "\n"
550        ):
551            log.info("\t" + str(d))
552        log.info("Param: ")
553        for d in str(json.dumps(self.get_param(), indent=4, sort_keys=True)).split(
554            "\n"
555        ):
556            log.info("\t" + str(d))
557        log.info("Sample list: " + str(self.get_header_sample_list()))
558        log.info("Dataframe: ")
559        for d in str(df).split("\n"):
560            log.info("\t" + str(d))
561
562        # garbage collector
563        del df
564        gc.collect()
565
566        return None

The function prints the input, output, config, and dataframe of the current object

def get_stats(self) -> dict:
568    def get_stats(self) -> dict:
569        """
570        The `get_stats` function calculates and returns various statistics of the current object,
571        including information about the input file, variants, samples, header fields, quality, and
572        SNVs/InDels.
573        :return: a dictionary containing various statistics of the current object. The dictionary has
574        the following structure:
575        """
576
577        # Log
578        log.info(f"Stats Calculation...")
579
580        # table varaints
581        table_variants_from = self.get_table_variants()
582
583        # stats dict
584        stats = {"Infos": {}}
585
586        ### File
587        input_file = self.get_input()
588        stats["Infos"]["Input file"] = input_file
589
590        # Header
591        header_infos = self.get_header().infos
592        header_formats = self.get_header().formats
593        header_infos_list = list(header_infos)
594        header_formats_list = list(header_formats)
595
596        ### Variants
597
598        stats["Variants"] = {}
599
600        # Variants by chr
601        sql_query_nb_variant_by_chrom = f'SELECT "#CHROM" as CHROM, count(*) as count FROM {table_variants_from} GROUP BY "#CHROM"'
602        df_nb_of_variants_by_chrom = self.get_query_to_df(sql_query_nb_variant_by_chrom)
603        nb_of_variants_by_chrom = df_nb_of_variants_by_chrom.sort_values(
604            by=["CHROM"], kind="quicksort"
605        )
606
607        # Total number of variants
608        nb_of_variants = nb_of_variants_by_chrom["count"].sum()
609
610        # Calculate percentage
611        nb_of_variants_by_chrom["percent"] = nb_of_variants_by_chrom["count"].apply(
612            lambda x: (x / nb_of_variants)
613        )
614
615        stats["Variants"]["Number of variants by chromosome"] = (
616            nb_of_variants_by_chrom.to_dict(orient="index")
617        )
618
619        stats["Infos"]["Number of variants"] = int(nb_of_variants)
620
621        ### Samples
622
623        # Init
624        samples = {}
625        nb_of_samples = 0
626
627        # Check Samples
628        if "GT" in header_formats_list and "FORMAT" in self.get_header_columns():
629            log.debug(f"Check samples...")
630            for sample in self.get_header_sample_list():
631                sql_query_samples = f"""
632                    SELECT  '{sample}' as sample,
633                            REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1) as genotype,
634                            count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1)) as count,
635                            concat((count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1))/{nb_of_variants})) as percentage
636                    FROM {table_variants_from}
637                    WHERE (
638                        regexp_matches("{sample}", '^[0-9]([/|][0-9])+')
639                        AND
640                        len(string_split(CAST("FORMAT" AS VARCHAR), ':')) = len(string_split(CAST("{sample}" AS VARCHAR), ':'))
641                      )
642                    GROUP BY genotype
643                    """
644                sql_query_genotype_df = self.conn.execute(sql_query_samples).df()
645                sample_genotype_count = sql_query_genotype_df["count"].sum()
646                if len(sql_query_genotype_df):
647                    nb_of_samples += 1
648                    samples[f"{sample} - {sample_genotype_count} variants"] = (
649                        sql_query_genotype_df.to_dict(orient="index")
650                    )
651
652            stats["Samples"] = samples
653            stats["Infos"]["Number of samples"] = nb_of_samples
654
655        # #
656        # if "FORMAT" in self.get_header_columns() and "DP" in header_formats_list:
657        #     stats["Infos"]["Number of samples"] = nb_of_samples
658        # elif nb_of_samples:
659        #     stats["Infos"]["Number of samples"] = "not a VCF format"
660
661        ### INFO and FORMAT fields
662        header_types_df = {}
663        header_types_list = {
664            "List of INFO fields": header_infos,
665            "List of FORMAT fields": header_formats,
666        }
667        i = 0
668        for header_type in header_types_list:
669
670            header_type_infos = header_types_list.get(header_type)
671            header_infos_dict = {}
672
673            for info in header_type_infos:
674
675                i += 1
676                header_infos_dict[i] = {}
677
678                # ID
679                header_infos_dict[i]["id"] = info
680
681                # num
682                genotype_map = {None: ".", -1: "A", -2: "G", -3: "R"}
683                if header_type_infos[info].num in genotype_map.keys():
684                    header_infos_dict[i]["Number"] = genotype_map.get(
685                        header_type_infos[info].num
686                    )
687                else:
688                    header_infos_dict[i]["Number"] = header_type_infos[info].num
689
690                # type
691                if header_type_infos[info].type:
692                    header_infos_dict[i]["Type"] = header_type_infos[info].type
693                else:
694                    header_infos_dict[i]["Type"] = "."
695
696                # desc
697                if header_type_infos[info].desc != None:
698                    header_infos_dict[i]["Description"] = header_type_infos[info].desc
699                else:
700                    header_infos_dict[i]["Description"] = ""
701
702            if len(header_infos_dict):
703                header_types_df[header_type] = pd.DataFrame.from_dict(
704                    header_infos_dict, orient="index"
705                ).to_dict(orient="index")
706
707        # Stats
708        stats["Infos"]["Number of INFO fields"] = len(header_infos_list)
709        stats["Infos"]["Number of FORMAT fields"] = len(header_formats_list)
710        stats["Header"] = header_types_df
711
712        ### QUAL
713        if "QUAL" in self.get_header_columns():
714            sql_query_qual = f"""
715                    SELECT
716                        avg(CAST(QUAL AS INTEGER)) AS Average,
717                        min(CAST(QUAL AS INTEGER)) AS Minimum,
718                        max(CAST(QUAL AS INTEGER)) AS Maximum,
719                        stddev(CAST(QUAL AS INTEGER)) AS StandardDeviation,
720                        median(CAST(QUAL AS INTEGER)) AS Median,
721                        variance(CAST(QUAL AS INTEGER)) AS Variance
722                    FROM {table_variants_from}
723                    WHERE CAST(QUAL AS VARCHAR) NOT IN ('.')
724                    """
725
726            qual = self.conn.execute(sql_query_qual).df().to_dict(orient="index")
727            stats["Quality"] = {"Stats": qual}
728
729        ### SNV and InDel
730
731        sql_query_snv = f"""
732            
733            SELECT Type, count FROM (
734
735                    SELECT
736                        'Total' AS Type,
737                        count(*) AS count
738                    FROM {table_variants_from}
739
740                    UNION
741
742                    SELECT
743                        'MNV' AS Type,
744                        count(*) AS count
745                    FROM {table_variants_from}
746                    WHERE len(REF) > 1 AND len(ALT) > 1
747                    AND len(REF) = len(ALT)
748
749                    UNION
750
751                    SELECT
752                        'InDel' AS Type,
753                        count(*) AS count
754                    FROM {table_variants_from}
755                    WHERE len(REF) > 1 OR len(ALT) > 1
756                    AND len(REF) != len(ALT)
757                    
758                    UNION
759
760                    SELECT
761                        'SNV' AS Type,
762                        count(*) AS count
763                    FROM {table_variants_from}
764                    WHERE len(REF) = 1 AND len(ALT) = 1
765
766                )
767
768            ORDER BY count DESC
769
770                """
771        snv_indel = self.conn.execute(sql_query_snv).df().to_dict(orient="index")
772
773        sql_query_snv_substitution = f"""
774                SELECT
775                    concat(REF, '>', ALT) AS 'Substitution',
776                    count(*) AS count
777                FROM {table_variants_from}
778                WHERE len(REF) = 1 AND len(ALT) = 1
779                GROUP BY REF, ALT
780                ORDER BY count(*) DESC
781                """
782        snv_substitution = (
783            self.conn.execute(sql_query_snv_substitution).df().to_dict(orient="index")
784        )
785        stats["Variants"]["Counts"] = snv_indel
786        stats["Variants"]["Substitutions"] = snv_substitution
787
788        return stats

The get_stats function calculates and returns various statistics of the current object, including information about the input file, variants, samples, header fields, quality, and SNVs/InDels.

Returns

a dictionary containing various statistics of the current object. The dictionary has the following structure:

def stats_to_file(self, file: str = None) -> str:
790    def stats_to_file(self, file: str = None) -> str:
791        """
792        The function `stats_to_file` takes a file name as input, retrieves statistics, serializes them
793        into a JSON object, and writes the JSON object to the specified file.
794
795        :param file: The `file` parameter is a string that represents the file path where the JSON data
796        will be written
797        :type file: str
798        :return: the name of the file that was written to.
799        """
800
801        # Get stats
802        stats = self.get_stats()
803
804        # Serializing json
805        json_object = json.dumps(stats, indent=4)
806
807        # Writing to sample.json
808        with open(file, "w") as outfile:
809            outfile.write(json_object)
810
811        return file

The function stats_to_file takes a file name as input, retrieves statistics, serializes them into a JSON object, and writes the JSON object to the specified file.

Parameters
  • file: The file parameter is a string that represents the file path where the JSON data will be written
Returns

the name of the file that was written to.

def print_stats(self, output_file: str = None, json_file: str = None) -> None:
813    def print_stats(self, output_file: str = None, json_file: str = None) -> None:
814        """
815        The `print_stats` function generates a markdown file and prints the statistics contained in a
816        JSON file in a formatted manner.
817
818        :param output_file: The `output_file` parameter is a string that specifies the path and filename
819        of the output file where the stats will be printed in Markdown format. If no `output_file` is
820        provided, a temporary directory will be created and the stats will be saved in a file named
821        "stats.md" within that
822        :type output_file: str
823        :param json_file: The `json_file` parameter is a string that represents the path to the JSON
824        file where the statistics will be saved. If no value is provided, a temporary directory will be
825        created and a default file name "stats.json" will be used
826        :type json_file: str
827        :return: The function `print_stats` does not return any value. It has a return type annotation
828        of `None`.
829        """
830
831        # Full path
832        output_file = full_path(output_file)
833        json_file = full_path(json_file)
834
835        with tempfile.TemporaryDirectory() as tmpdir:
836
837            # Files
838            if not output_file:
839                output_file = os.path.join(tmpdir, "stats.md")
840            if not json_file:
841                json_file = os.path.join(tmpdir, "stats.json")
842
843            # Create folders
844            if not os.path.exists(os.path.dirname(output_file)):
845                Path(os.path.dirname(output_file)).mkdir(parents=True, exist_ok=True)
846            if not os.path.exists(os.path.dirname(json_file)):
847                Path(os.path.dirname(json_file)).mkdir(parents=True, exist_ok=True)
848
849            # Create stats JSON file
850            stats_file = self.stats_to_file(file=json_file)
851
852            # Print stats file
853            with open(stats_file) as f:
854                stats = yaml.safe_load(f)
855
856            # Output
857            output_title = []
858            output_index = []
859            output = []
860
861            # Title
862            output_title.append("# HOWARD Stats")
863
864            # Index
865            output_index.append("## Index")
866
867            # Process sections
868            for section in stats:
869                infos = stats.get(section)
870                section_link = "#" + section.lower().replace(" ", "-")
871                output.append(f"## {section}")
872                output_index.append(f"- [{section}]({section_link})")
873
874                if len(infos):
875                    for info in infos:
876                        try:
877                            df = pd.DataFrame.from_dict(infos.get(info), orient="index")
878                            is_df = True
879                        except:
880                            try:
881                                df = pd.DataFrame.from_dict(
882                                    json.loads((infos.get(info))), orient="index"
883                                )
884                                is_df = True
885                            except:
886                                is_df = False
887                        if is_df:
888                            output.append(f"### {info}")
889                            info_link = "#" + info.lower().replace(" ", "-")
890                            output_index.append(f"   - [{info}]({info_link})")
891                            output.append(f"{df.to_markdown(index=False)}")
892                        else:
893                            output.append(f"- {info}: {infos.get(info)}")
894                else:
895                    output.append(f"NA")
896
897            # Write stats in markdown file
898            with open(output_file, "w") as fp:
899                for item in output_title:
900                    fp.write("%s\n" % item)
901                for item in output_index:
902                    fp.write("%s\n" % item)
903                for item in output:
904                    fp.write("%s\n" % item)
905
906            # Output stats in markdown
907            print("")
908            print("\n\n".join(output_title))
909            print("")
910            print("\n\n".join(output))
911            print("")
912
913        return None

The print_stats function generates a markdown file and prints the statistics contained in a JSON file in a formatted manner.

Parameters
  • output_file: The output_file parameter is a string that specifies the path and filename of the output file where the stats will be printed in Markdown format. If no output_file is provided, a temporary directory will be created and the stats will be saved in a file named "stats.md" within that
  • json_file: The json_file parameter is a string that represents the path to the JSON file where the statistics will be saved. If no value is provided, a temporary directory will be created and a default file name "stats.json" will be used
Returns

The function print_stats does not return any value. It has a return type annotation of None.

def get_input(self) -> str:
915    def get_input(self) -> str:
916        """
917        It returns the value of the input variable.
918        :return: The input is being returned.
919        """
920        return self.input

It returns the value of the input variable.

Returns

The input is being returned.

def get_input_format(self, input_file: str = None) -> str:
922    def get_input_format(self, input_file: str = None) -> str:
923        """
924        This function returns the format of the input variable, either from the provided input file or
925        by prompting for input.
926
927        :param input_file: The `input_file` parameter in the `get_input_format` method is a string that
928        represents the file path of the input file. If no `input_file` is provided when calling the
929        method, it will default to `None`
930        :type input_file: str
931        :return: The format of the input variable is being returned.
932        """
933
934        if not input_file:
935            input_file = self.get_input()
936        input_format = get_file_format(input_file)
937        return input_format

This function returns the format of the input variable, either from the provided input file or by prompting for input.

Parameters
  • input_file: The input_file parameter in the get_input_format method is a string that represents the file path of the input file. If no input_file is provided when calling the method, it will default to None
Returns

The format of the input variable is being returned.

def get_input_compressed(self, input_file: str = None) -> str:
939    def get_input_compressed(self, input_file: str = None) -> str:
940        """
941        The function `get_input_compressed` returns the format of the input variable after compressing
942        it.
943
944        :param input_file: The `input_file` parameter in the `get_input_compressed` method is a string
945        that represents the file path of the input file. If no `input_file` is provided when calling the
946        method, it will default to `None` and the method will then call `self.get_input()` to
947        :type input_file: str
948        :return: The function `get_input_compressed` returns the compressed format of the input
949        variable.
950        """
951
952        if not input_file:
953            input_file = self.get_input()
954        input_compressed = get_file_compressed(input_file)
955        return input_compressed

The function get_input_compressed returns the format of the input variable after compressing it.

Parameters
  • input_file: The input_file parameter in the get_input_compressed method is a string that represents the file path of the input file. If no input_file is provided when calling the method, it will default to None and the method will then call self.get_input() to obtain the input file.
Returns

The function get_input_compressed returns the compressed format of the input variable.

def get_output(self) -> str:
    def get_output(self) -> str:
        """
        Return the output file, as set by `set_output`.

        :return: the output file path (or None when no output is set)
        """

        return self.output

It returns the output file.

Returns

The output file path is being returned.

def get_output_format(self, output_file: str = None) -> str:
965    def get_output_format(self, output_file: str = None) -> str:
966        """
967        The function `get_output_format` returns the format of the input variable or the output file if
968        provided.
969
970        :param output_file: The `output_file` parameter in the `get_output_format` method is a string
971        that represents the file path of the output file. If no `output_file` is provided when calling
972        the method, it will default to the output obtained from the `get_output` method of the class
973        instance. The
974        :type output_file: str
975        :return: The format of the input variable is being returned.
976        """
977
978        if not output_file:
979            output_file = self.get_output()
980        output_format = get_file_format(output_file)
981
982        return output_format

The function get_output_format returns the format of the input variable or the output file if provided.

Parameters
  • output_file: The output_file parameter in the get_output_format method is a string that represents the file path of the output file. If no output_file is provided when calling the method, it will default to the output obtained from the get_output method of the class instance.
Returns

The format of the input variable is being returned.

def get_config(self) -> dict:
984    def get_config(self) -> dict:
985        """
986        It returns the config
987        :return: The config variable is being returned.
988        """
989        return self.config

It returns the config

Returns

The config variable is being returned.

def get_param(self) -> dict:
991    def get_param(self) -> dict:
992        """
993        It returns the param
994        :return: The param variable is being returned.
995        """
996        return self.param

It returns the param

Returns

The param variable is being returned.

def get_connexion_db(self) -> str:
 998    def get_connexion_db(self) -> str:
 999        """
1000        It returns the connexion_db attribute of the object
1001        :return: The connexion_db is being returned.
1002        """
1003        return self.connexion_db

It returns the connexion_db attribute of the object

Returns

The connexion_db is being returned.

def get_prefix(self) -> str:
1005    def get_prefix(self) -> str:
1006        """
1007        It returns the prefix of the object.
1008        :return: The prefix is being returned.
1009        """
1010        return self.prefix

It returns the prefix of the object.

Returns

The prefix is being returned.

def get_table_variants(self, clause: str = "select") -> str:
    """
    Return the variants table identifier suitable for a SQL clause.

    :param clause: the type of clause the table will be used in, either
        "select", "where", "update" or "from" (optional), defaults to
        "select"
    :return: The table identifier, aliased when used in a FROM clause.
    """

    # Access mode from configuration ("RO" means read-only)
    access = self.get_config().get("access", None)

    if clause == "from":
        # Read-only parquet input can be queried straight from the file
        if self.get_input_format() in ["parquet"] and access in ["RO"]:
            return f"'{self.get_input()}' as variants"
        # Otherwise alias the table for the FROM clause
        return f"{self.table_variants} as variants"

    # "select", "where", "update" and any other clause use the bare name
    return self.table_variants

This function returns the table_variants attribute of the object

Parameters
  • clause: the type of clause the table will be used. Either "select" or "from" (optional), defaults to select (optional)
Returns

The table_variants attribute of the object.

def get_tmp_dir(self) -> str:
    """
    Return the temporary directory path.

    The path is resolved from the configuration and parameters, falling
    back to "/tmp" when nothing is configured.

    :return: The temporary directory path as a string.
    """

    config = self.get_config()
    param = self.get_param()
    return get_tmp(config=config, param=param, default_tmp="/tmp")

The function get_tmp_dir returns the temporary directory path based on configuration parameters or a default path.

Returns

The get_tmp_dir method is returning the temporary directory path based on the configuration, parameters, and a default value of "/tmp".

def get_connexion_type(self) -> str:
    """
    Return the connexion type from the configuration.

    No validation is performed here: the configured value is returned
    as-is, defaulting to "memory" when not set.

    :return: The connexion type (e.g. "memory" or a file path).
    """
    return self.get_config().get("connexion_type", "memory")

Returns the connexion type from the configuration, defaulting to "memory" when it is not set; no validation is performed here.

Returns

The connexion type is being returned.

def get_connexion(self):
    """
    Return the database connection object.

    :return: The connection object stored on this instance.
    """
    connexion = self.conn
    return connexion

It returns the connection object

Returns

The connection object.

def close_connexion(self) -> None:
    """
    Close the database connection.

    :return: The value returned by the connection's ``close`` method
        (normally None).
    """
    connexion = self.conn
    return connexion.close()

This function closes the connection to the database.

Returns

The connection is being closed.

def get_header(self, type: str = "vcf"):
    """
    Return the header of the VCF file.

    :param type: the representation wanted, either "vcf" (parsed header
        object) or "list" (header lines), defaults to "vcf"
    :return: The header in the requested representation, or None for an
        unknown type.
    """

    # A header has been loaded on this object
    if self.header_vcf:
        if type == "list":
            return self.header_list
        if type == "vcf":
            return self.header_vcf
        return None

    # No header loaded: fall back to the minimal required VCF header
    if type == "list":
        return vcf_required
    if type == "vcf":
        return vcf.Reader(io.StringIO("\n".join(vcf_required)))
    return None

This function returns the header of the VCF file as a list of strings

Parameters
  • type: the type of header you want to get, defaults to vcf (optional)
Returns

The header of the vcf file.

def get_header_length(self, file: str = None) -> int:
    """
    Return the number of header lines, excluding the #CHROM line.

    :param file: optional path to a VCF header file; when provided, the
        header is read from this file instead of the loaded header
    :type file: str
    :return: the header length without the #CHROM line, or 0 when no
        header is available.
    """

    # Explicit header file takes precedence
    if file:
        return len(self.read_vcf_header_file(file=file)) - 1

    header_list = self.get_header(type="list")
    if header_list:
        return len(header_list) - 1
    return 0

The function get_header_length returns the length of the header list, excluding the #CHROM line.

Parameters
  • file: The file parameter is an optional argument that specifies the path to a VCF header file. If this argument is provided, the function will read the header from the specified file and return the length of the header list minus 1 (to exclude the #CHROM line)
Returns

the length of the header list, excluding the #CHROM line.

def get_header_columns(self) -> str:
    """
    Return the last header line (the #CHROM columns line) of the VCF.

    :return: The columns header line, or an empty string when no header
        is available.
    """
    if not self.get_header():
        return ""
    return self.get_header(type="list")[-1]

This function returns the header list of a VCF

Returns

The last header line (the #CHROM columns line), or an empty string when no header is available.

def get_header_columns_as_list(self) -> list:
    """
    Return the columns of the #CHROM header line as a list.

    :return: The list of column names, or an empty list when no header
        is available.
    """
    if not self.get_header():
        return []
    return self.get_header_columns().strip().split("\t")

This function returns the header list of a VCF

Returns

The list of column names from the #CHROM header line, or an empty list when no header is available.

def get_header_columns_as_sql(self) -> str:
    """
    Return the header column names quoted and joined for use in SQL.

    :return: A comma-separated string of double-quoted column names.
    """
    return ",".join(f'"{column}"' for column in self.get_header_columns_as_list())

This function returns the header column names quoted and joined for use in SQL.

Returns

A comma-separated string of double-quoted column names.

def get_header_sample_list(
    self, check: bool = False, samples: list = None, samples_force: bool = False
) -> list:
    """
    Return the list of samples from the VCF header.

    :param check: when True, keep only samples that are well-defined
        genotype columns, defaults to False
    :type check: bool (optional)
    :param samples: optional subset of sample names; samples absent from
        the header are dropped with a warning
    :type samples: list
    :param samples_force: when True (together with ``samples``), return
        the header-filtered subset without the genotype-column check,
        defaults to False
    :type samples_force: bool (optional)
    :return: The list of sample names matching the given constraints.
    """

    if samples is None:
        # No subset requested: use all samples from the header
        samples_list = self.header_vcf.samples
    else:
        # Keep only the requested samples that exist in the header
        samples_list = []
        for requested in samples:
            if requested in self.header_vcf.samples:
                samples_list.append(requested)
            else:
                log.warning(f"Sample '{requested}' not defined in header")

        # Force sample list without checking if is_genotype_column
        if samples_force:
            log.warning(f"Samples {samples_list} not checked if genotypes")
            return samples_list

    if check:
        # Keep only samples whose column is a well-defined genotype column
        checked = []
        for sample in samples_list:
            if self.is_genotype_column(column=sample):
                checked.append(sample)
            else:
                log.warning(
                    f"Sample '{sample}' not defined as a sample (genotype not well defined)"
                )
        samples_list = checked

    # Return samples list
    return samples_list

The function get_header_sample_list returns a list of samples from a VCF header, with optional checking and filtering based on input parameters.

Parameters
  • check: The check parameter in the get_header_sample_list function is a boolean parameter that determines whether to check if the samples in the list are properly defined as genotype columns. If check is set to True, the function will verify that each sample in the list is defined as a genotype column. Defaults to False.
  • samples: The samples parameter in the get_header_sample_list function is a list that allows you to specify a subset of samples from the header. If you provide a list of sample names, the function will check if each sample is defined in the header; any sample not found in the header is dropped with a warning.
  • samples_force: The samples_force parameter in the get_header_sample_list function is a boolean parameter that determines whether to force the function to return the sample list without checking if the samples are genotype columns. If samples_force is set to True, the function will return the sample list without performing the genotype-column check. Defaults to False.
Returns

The function get_header_sample_list returns a list of samples based on the input parameters and conditions specified in the function.

def is_genotype_column(self, column: str = None) -> bool:
    """
    Check whether a column of the input database is a genotype column.

    :param column: the column name to check; when None, no check is done
    :type column: str
    :return: True when the column is a genotype column of the input
        database, False otherwise (in particular when no column is given).
    """

    # No column to check
    if column is None:
        return False

    # Delegate the check to the Database object built on the input file
    return Database(database=self.get_input()).is_genotype_column(column=column)

This function checks if a given column is a genotype column in a database.

Parameters
  • column: The column parameter in the is_genotype_column method is a string that represents the column name in a database table. This method checks if the specified column is a genotype column in the database. If a column name is provided, it delegates to the is_genotype_column method of the Database class built on the input file.
Returns

The is_genotype_column method is returning a boolean value. If the column parameter is not None, it calls the is_genotype_column method of the Database class with the specified column name and returns the result. If the column parameter is None, it returns False.

def get_verbose(self) -> bool:
    """
    Return the "verbose" flag from the configuration.

    :return: The configured "verbose" value, or False when the key is
        absent.
    """
    config = self.get_config()
    return config.get("verbose", False)

It returns the value of the "verbose" key in the config dictionary, or False if the key doesn't exist

Returns

The value of the key "verbose" in the config dictionary.

def get_connexion_format(self) -> str:
    """
    Return the connexion format of this object, validating it.

    :return: The connexion format, either "duckdb" or "sqlite".
    :raises ValueError: when the connexion format is unknown.
    """

    connexion_format = self.connexion_format
    if connexion_format in ["duckdb", "sqlite"]:
        return connexion_format

    # Unknown format: log and fail
    log.error(f"Unknown connexion format {connexion_format}")
    raise ValueError(f"Unknown connexion format {connexion_format}")

It returns the connexion format of the object.

Returns

The connexion_format is being returned.

def insert_file_to_table(
    self,
    file,
    columns: str,
    header_len: int = 0,
    sep: str = "\t",
    chunksize: int = 1000000,
) -> None:
    """
    Read a delimited file in chunks and insert each chunk into the
    "variants" table of the current database connexion.

    :param file: path or file object of the file to load
    :param columns: comma-separated list of column names to insert
    :type columns: str
    :param header_len: number of header lines to skip before the data,
        defaults to 0
    :type header_len: int (optional)
    :param sep: field separator used in the file, defaults to "\\t"
    :type sep: str (optional)
    :param chunksize: number of rows read per chunk; overridden by the
        "load.chunk" configuration entry when present, defaults to
        1000000
    :type chunksize: int (optional)
    """

    # Config
    # The configured chunk size takes precedence over the argument
    chunksize = self.get_config().get("load", {}).get("chunk", chunksize)
    connexion_format = self.get_connexion_format()

    log.debug("chunksize: " + str(chunksize))

    # NOTE(review): when chunksize is falsy nothing is inserted at all —
    # confirm this silent no-op is intended
    if chunksize:
        for chunk in pd.read_csv(
            file, skiprows=header_len, sep=sep, chunksize=chunksize, engine="c"
        ):
            if connexion_format in ["duckdb"]:
                # DuckDB resolves the local DataFrame by its variable name
                # "chunk" (replacement scan), so renaming it would break
                # this query
                sql_insert_into = (
                    f"INSERT INTO variants ({columns}) SELECT {columns} FROM chunk"
                )
                self.conn.execute(sql_insert_into)
            elif connexion_format in ["sqlite"]:
                chunk.to_sql("variants", self.conn, if_exists="append", index=False)

The function reads a file in chunks and inserts each chunk into a table based on the specified database format.

Parameters
  • file: The file parameter is the file that you want to load into a table. It should be the path to the file on your system
  • columns: The columns parameter in the insert_file_to_table function is a string that should contain the names of the columns in the table where the data will be inserted. The column names should be separated by commas within the string. For example, if you have columns named "id", "name
  • header_len: The header_len parameter in the insert_file_to_table function specifies the number of lines to skip at the beginning of the file before reading the actual data. This parameter allows you to skip any header information present in the file before processing the data, defaults to 0
  • sep: The sep parameter in the insert_file_to_table function is used to specify the separator character that is used in the file being read. The default separator is set to `\t`, which represents a tab character; you can change this parameter to a different separator character if needed.
  • chunksize: The chunksize parameter specifies the number of rows to read in at a time when processing the file in chunks. The default value is 1000000, meaning the file is read in chunks of 1,000,000 rows.
def load_data(
    self,
    input_file: str = None,
    drop_variants_table: bool = False,
    sample_size: int = 20480,
) -> None:
    """
    Read the input file (VCF, TSV, CSV, PSV or an existing database
    format) and load it into the variants table.

    :param input_file: path to the input file to load; when given, it
        replaces the current input and the header is re-read
    :type input_file: str
    :param drop_variants_table: when True, drop the variants table before
        loading the data, defaults to False
    :type drop_variants_table: bool (optional)
    :param sample_size: number of rows sampled to infer the schema; falsy
        values mean "no limit" (-1), defaults to 20480
    :type sample_size: int (optional)
    :raises ValueError: when the input format is not compatible with the
        connexion format or cannot be loaded.
    """

    log.info("Loading...")

    # change input file
    if input_file:
        self.set_input(input_file)
        self.set_header()

    # drop variants table
    if drop_variants_table:
        self.drop_variants_table()

    # get table variants
    table_variants = self.get_table_variants()

    # Access
    access = self.get_config().get("access", None)
    log.debug(f"access: {access}")

    # Input format and compress
    input_format = self.get_input_format()
    input_compressed = self.get_input_compressed()
    log.debug(f"input_format: {input_format}")
    log.debug(f"input_compressed: {input_compressed}")

    # input_compressed_format
    if input_compressed:
        input_compressed_format = "gzip"
    else:
        input_compressed_format = "none"
    log.debug(f"input_compressed_format: {input_compressed_format}")

    # Connexion format
    connexion_format = self.get_connexion_format()

    # Sample size (falsy means "no limit")
    if not sample_size:
        sample_size = -1
    log.debug(f"sample_size: {sample_size}")

    # Load data
    log.debug(f"Load Data from {input_format}")

    # DuckDB connexion
    if connexion_format in ["duckdb"]:

        # Database already exists
        if self.input_format in ["db", "duckdb"]:

            if connexion_format in ["duckdb"]:
                log.debug(f"Input file format '{self.input_format}' duckDB")
            else:
                log.error(
                    f"Input file format '{self.input_format}' not compatible with database format '{connexion_format}'"
                )
                raise ValueError(
                    f"Input file format '{self.input_format}' not compatible with database format '{connexion_format}'"
                )

        # Load from existing database format
        else:

            try:
                # Create Table or View
                database = Database(database=self.input)
                sql_from = database.get_sql_from(sample_size=sample_size)

                # Read-only access exposes the source as a view,
                # otherwise the data is materialized into a table
                if access in ["RO"]:
                    sql_load = (
                        f"CREATE VIEW {table_variants} AS SELECT * FROM {sql_from}"
                    )
                else:
                    sql_load = (
                        f"CREATE TABLE {table_variants} AS SELECT * FROM {sql_from}"
                    )
                self.conn.execute(sql_load)

            except Exception as err:
                # Format not available; chain the original cause
                log.error(f"Input file format '{self.input_format}' not available")
                raise ValueError(
                    f"Input file format '{self.input_format}' not available"
                ) from err

    # SQLite connexion
    elif connexion_format in ["sqlite"] and input_format in [
        "vcf",
        "tsv",
        "csv",
        "psv",
    ]:

        # Main structure
        structure = {
            "#CHROM": "VARCHAR",
            "POS": "INTEGER",
            "ID": "VARCHAR",
            "REF": "VARCHAR",
            "ALT": "VARCHAR",
            "QUAL": "VARCHAR",
            "FILTER": "VARCHAR",
            "INFO": "VARCHAR",
        }

        # Structure with samples (copy so the base structure stays intact)
        structure_complete = dict(structure)
        if self.get_header_sample_list():
            structure_complete["FORMAT"] = "VARCHAR"
            for sample in self.get_header_sample_list():
                structure_complete[sample] = "VARCHAR"

        # Columns list for create and insert
        sql_create_table_columns = []
        sql_create_table_columns_list = []
        for column, column_type in structure_complete.items():
            sql_create_table_columns.append(
                f'"{column}" {column_type} default NULL'
            )
            sql_create_table_columns_list.append(f'"{column}"')

        # Create database
        log.debug(f"Create Table {table_variants}")
        sql_create_table_columns_sql = ", ".join(sql_create_table_columns)
        sql_create_table_columns_list_sql = ", ".join(sql_create_table_columns_list)
        sql_create_table = f"CREATE TABLE IF NOT EXISTS {table_variants} ({sql_create_table_columns_sql})"
        self.conn.execute(sql_create_table)

        # chunksize define length of file chunk load file
        chunksize = 100000

        # delimiter
        delimiter = file_format_delimiters.get(input_format, "\t")

        # Use the appropriate file handler based on the input format
        # (bgzf for compressed input, plain text otherwise), and make
        # sure the handle that is actually read is the one closed
        if input_compressed:
            file_handler = bgzf.open(self.input, "rt")
        else:
            file_handler = open(self.input, "rt")

        # Load the input file
        with file_handler as input_stream:

            # Skip the VCF header lines; other formats have no header
            if input_format in ["vcf"]:
                header_len = self.get_header_length()
            else:
                header_len = 0

            # Insert the file contents into a table
            self.insert_file_to_table(
                input_stream,
                columns=sql_create_table_columns_list_sql,
                header_len=header_len,
                sep=delimiter,
                chunksize=chunksize,
            )

    else:
        log.error(
            f"Connexion format '{connexion_format}' not available with format '{input_format}'"
        )
        raise ValueError(
            f"Connexion format '{connexion_format}' not available with format '{input_format}'"
        )

    # Explode INFOS fields into table fields
    if self.get_explode_infos():
        self.explode_infos(
            prefix=self.get_explode_infos_prefix(),
            fields=self.get_explode_infos_fields(),
            force=True,
        )

    # Create index after insertion
    self.create_indexes()

The load_data function reads a VCF file and inserts it into a table, with options to drop the table before loading the data and specify a sample size.

Parameters
  • input_file: The path to the input file. This is the VCF file that will be loaded into the table
  • drop_variants_table: The drop_variants_table parameter is a boolean flag that determines whether the variants table should be dropped before loading the data. If set to True, the variants table will be dropped. If set to False (default), the variants table will not be dropped, defaults to False
  • sample_size: The sample_size parameter determines the number of rows to be sampled from the input file. If it is set to None, the default value of 20480 will be used, defaults to 20480
def get_explode_infos(self) -> bool:
    """
    Return the "explode_infos" flag from the parameters.

    :return: The "explode.explode_infos" parameter value, or False when
        it is not set.
    """
    explode_params = self.get_param().get("explode", {})
    return explode_params.get("explode_infos", False)

The function get_explode_infos returns the value of the "explode_infos" parameter, defaulting to False if it is not set.

Returns

The method is returning the value of the "explode_infos" parameter, which is a boolean value. If the parameter is not present, it will return False.

def get_explode_infos_fields(
    self,
    explode_infos_fields: str = None,
    remove_fields_not_in_header: bool = False,
) -> list:
    """
    Return the list of INFO fields to explode.

    The input can be a comma-separated string or a list of field names;
    each entry is treated as a regex pattern matched against the header
    INFO fields. The keyword "*" (the default when nothing is provided or
    configured) expands to all INFO fields of the header.

    :param explode_infos_fields: fields to explode, as a comma-separated
        string or a list; each entry may be a regex pattern, and "*"
        expands to every INFO field of the header
    :type explode_infos_fields: str
    :param remove_fields_not_in_header: when True, fields that are not
        present in the header are removed from the result, defaults to
        False
    :type remove_fields_not_in_header: bool (optional)
    :return: The list of resolved field names, without duplicates.
    """

    # If no fields, get it in param
    if not explode_infos_fields:
        explode_infos_fields = (
            self.get_param().get("explode", {}).get("explode_infos_fields", None)
        )

    # If no fields, defined as all fields in header using keyword
    if not explode_infos_fields:
        explode_infos_fields = "*"

    # If fields list not empty
    if explode_infos_fields:

        # Input fields list (accept a comma-separated string or a list)
        if isinstance(explode_infos_fields, str):
            fields_input = explode_infos_fields.split(",")
        elif isinstance(explode_infos_fields, list):
            fields_input = explode_infos_fields
        else:
            fields_input = []

        # Fields in header
        fields_in_header = sorted(list(set(self.get_header().infos)))

        # Construct list of fields
        fields_output = []
        for field in fields_input:

            # Strip field
            field = field.strip()

            # format keyword * in regex
            if field.upper() in ["*"]:
                field = ".*"

            # Find all fields with pattern
            r = re.compile(field)
            fields_search = sorted(list(filter(r.match, fields_in_header)))

            # Remove fields input from search
            # (an exact match takes precedence over pattern expansion)
            if field in fields_search:
                fields_search = [field]
            elif fields_search != [field]:
                fields_search = sorted(
                    list(set(fields_search).difference(fields_input))
                )

            # If field is not in header (avoid not well formatted header)
            if not fields_search and not remove_fields_not_in_header:
                fields_search = [field]

            # Add found fields
            for new_field in fields_search:
                # Add field, if not already exists, and if it is in header (if asked)
                if (
                    new_field not in fields_output
                    and (
                        not remove_fields_not_in_header
                        or new_field in fields_in_header
                    )
                    and new_field not in [".*"]
                ):
                    fields_output.append(new_field)

        return fields_output

    else:

        return []

The get_explode_infos_fields function returns a list of exploded information fields based on the input parameter explode_infos_fields.

Parameters
  • explode_infos_fields: The explode_infos_fields parameter is a string that specifies the fields to be exploded. It can be set to "*" to explode all fields, or it can be a comma-separated list of field names (each entry may be a regex pattern matched against the header fields).
  • remove_fields_not_in_header: The parameter remove_fields_not_in_header is a boolean flag that determines whether to remove fields that are not present in the header. If it is set to True, any field that is not in the header will be excluded from the list of exploded information fields. If it is set to `, defaults to False
Returns

The function get_explode_infos_fields returns a list of exploded information fields. If the explode_infos_fields parameter is not provided and nothing is configured, it defaults to "*", which expands to all INFO fields of the header. Otherwise, the comma-separated entries are stripped of spaces and resolved against the header fields.

def get_explode_infos_prefix(self, explode_infos_prefix: str = None) -> str:
1606    def get_explode_infos_prefix(self, explode_infos_prefix: str = None) -> str:
1607        """
1608        The function `get_explode_infos_prefix` returns the value of the `explode_infos_prefix` parameter, or
1609        the value of `self.get_param().get("explode_infos_prefix", None)` if `explode_infos_prefix` is
1610        not provided.
1611
1612        :param explode_infos_prefix: The parameter `explode_infos_prefix` is a string that specifies a
1613        prefix to be used for exploding or expanding information
1614        :type explode_infos_prefix: str
1615        :return: the value of the variable `explode_infos_prefix`.
1616        """
1617
1618        if not explode_infos_prefix:
1619            explode_infos_prefix = (
1620                self.get_param().get("explode", {}).get("explode_infos_prefix", "")
1621            )
1622
1623        return explode_infos_prefix

The function get_explode_infos_prefix returns the value of the explode_infos_prefix parameter, or the value of self.get_param().get("explode", {}).get("explode_infos_prefix", "") if explode_infos_prefix is not provided.

Parameters
  • explode_infos_prefix: The parameter explode_infos_prefix is a string that specifies a prefix to be used for exploding or expanding information
Returns

the value of the variable explode_infos_prefix.

def add_column( self, table_name, column_name, column_type, default_value=None, drop: bool = False) -> dict:
1625    def add_column(
1626        self,
1627        table_name,
1628        column_name,
1629        column_type,
1630        default_value=None,
1631        drop: bool = False,
1632    ) -> dict:
1633        """
1634        The `add_column` function adds a column to a SQLite or DuckDB table with a default value if it
1635        doesn't already exist.
1636
1637        :param table_name: The name of the table to which you want to add a column
1638        :param column_name: The parameter "column_name" is the name of the column that you want to add
1639        to the table
1640        :param column_type: The `column_type` parameter specifies the data type of the column that you
1641        want to add to the table. It should be a string that represents the desired data type, such as
1642        "INTEGER", "TEXT", "REAL", etc
1643        :param default_value: The `default_value` parameter is an optional parameter that specifies the
1644        default value for the newly added column. If a default value is provided, it will be assigned to
1645        the column for any existing rows that do not have a value for that column
1646        :param drop: The `drop` parameter is a boolean flag that determines whether to drop the column
1647        if it already exists in the table. If `drop` is set to `True`, the function will drop the
1648        existing column before adding the new column. If `drop` is set to `False` (default),, defaults
1649        to False
1650        :type drop: bool (optional)
1651        :return: a boolean value indicating whether the column was successfully added to the table.
1652        """
1653
1654        # added
1655        added = False
1656        dropped = False
1657
1658        # Check if the column already exists in the table
1659        query = f""" SELECT * FROM {table_name} LIMIT 0 """
1660        columns = self.get_query_to_df(query).columns.tolist()
1661        if column_name.upper() in [c.upper() for c in columns]:
1662            log.debug(
1663                f"The {column_name} column already exists in the {table_name} table"
1664            )
1665            if drop:
1666                self.drop_column(table_name=table_name, column_name=column_name)
1667                dropped = True
1668            else:
1669                return None
1670        else:
1671            log.debug(f"The {column_name} column NOT exists in the {table_name} table")
1672
1673        # Add column in table
1674        add_column_query = (
1675            f""" ALTER TABLE {table_name} ADD COLUMN "{column_name}" {column_type} """
1676        )
1677        if default_value is not None:
1678            add_column_query += f" DEFAULT {default_value}"
1679        self.execute_query(add_column_query)
1680        added = not dropped
1681        log.debug(
1682            f"The {column_name} column was successfully added to the {table_name} table"
1683        )
1684
1685        if added:
1686            added_column = {
1687                "table_name": table_name,
1688                "column_name": column_name,
1689                "column_type": column_type,
1690                "default_value": default_value,
1691            }
1692        else:
1693            added_column = None
1694
1695        return added_column

The add_column function adds a column to a SQLite or DuckDB table with a default value if it doesn't already exist.

Parameters
  • table_name: The name of the table to which you want to add a column
  • column_name: The parameter "column_name" is the name of the column that you want to add to the table
  • column_type: The column_type parameter specifies the data type of the column that you want to add to the table. It should be a string that represents the desired data type, such as "INTEGER", "TEXT", "REAL", etc
  • default_value: The default_value parameter is an optional parameter that specifies the default value for the newly added column. If a default value is provided, it will be assigned to the column for any existing rows that do not have a value for that column
  • drop: The drop parameter is a boolean flag that determines whether to drop the column if it already exists in the table. If drop is set to True, the function will drop the existing column before adding the new column; if drop is set to False (default), an existing column is left untouched. Defaults to False
Returns

a dict describing the added column (keys "table_name", "column_name", "column_type", "default_value") when a new column was created, or None when the column already existed.

def drop_column( self, column: dict = None, table_name: str = None, column_name: str = None) -> bool:
1697    def drop_column(
1698        self, column: dict = None, table_name: str = None, column_name: str = None
1699    ) -> bool:
1700        """
1701        The `drop_column` function drops a specified column from a given table in a database and returns
1702        True if the column was successfully dropped, and False if the column does not exist in the
1703        table.
1704
1705        :param column: The `column` parameter is a dictionary that contains information about the column
1706        you want to drop. It has two keys:
1707        :type column: dict
1708        :param table_name: The `table_name` parameter is the name of the table from which you want to
1709        drop a column
1710        :type table_name: str
1711        :param column_name: The `column_name` parameter is the name of the column that you want to drop
1712        from the table
1713        :type column_name: str
1714        :return: a boolean value. It returns True if the column was successfully dropped from the table,
1715        and False if the column does not exist in the table.
1716        """
1717
1718        # Find column infos
1719        if column:
1720            if isinstance(column, dict):
1721                table_name = column.get("table_name", None)
1722                column_name = column.get("column_name", None)
1723            elif isinstance(column, str):
1724                table_name = self.get_table_variants()
1725                column_name = column
1726            else:
1727                table_name = None
1728                column_name = None
1729
1730        if not table_name and not column_name:
1731            return False
1732
1733        # Removed
1734        removed = False
1735
1736        # Check if the column already exists in the table
1737        query = f""" SELECT * FROM {table_name} LIMIT 0 """
1738        columns = self.get_query_to_df(query).columns.tolist()
1739        if column_name in columns:
1740            log.debug(f"The {column_name} column exists in the {table_name} table")
1741        else:
1742            log.debug(f"The {column_name} column NOT exists in the {table_name} table")
1743            return False
1744
1745        # Add column in table # ALTER TABLE integers DROP k
1746        add_column_query = f""" ALTER TABLE {table_name} DROP "{column_name}" """
1747        self.execute_query(add_column_query)
1748        removed = True
1749        log.debug(
1750            f"The {column_name} column was successfully dropped to the {table_name} table"
1751        )
1752
1753        return removed

The drop_column function drops a specified column from a given table in a database and returns True if the column was successfully dropped, and False if the column does not exist in the table.

Parameters
  • column: The column parameter is a dictionary that contains information about the column you want to drop. It has two keys: "table_name" and "column_name"
  • table_name: The table_name parameter is the name of the table from which you want to drop a column
  • column_name: The column_name parameter is the name of the column that you want to drop from the table
Returns

a boolean value. It returns True if the column was successfully dropped from the table, and False if the column does not exist in the table.

def explode_infos( self, prefix: str = None, create_index: bool = False, fields: list = None, force: bool = False, proccess_all_fields_together: bool = False, table: str = None) -> list:
    def explode_infos(
        self,
        prefix: str = None,
        create_index: bool = False,
        fields: list = None,
        force: bool = False,
        proccess_all_fields_together: bool = False,
        table: str = None,
    ) -> list:
        """
        Explode VCF INFO fields into individual columns of the variants table.

        For each requested INFO field, a column named "<prefix><field>" is
        added (via add_column) and populated by parsing the raw INFO string
        with a dialect-specific SQL expression (regex extraction on duckdb,
        instr/substr arithmetic on sqlite). Updates are batched per
        chromosome. Nothing is modified when access is read-only ("RO").

        :param prefix: prefix for the exploded columns; when None/True or not
        a string, falls back to get_explode_infos_prefix(), then to "INFO/"
        :type prefix: str
        :param create_index: whether to (re)create indexes after exploding,
        defaults to False
        :type create_index: bool (optional)
        :param fields: list of INFO fields (or patterns, resolved through
        get_explode_infos_fields) to explode
        :type fields: list
        :param force: if True, an existing exploded column is dropped,
        re-created and re-computed, defaults to False
        :type force: bool (optional)
        :param proccess_all_fields_together: if True, all fields are updated
        in a single UPDATE statement instead of one UPDATE per field,
        defaults to False
        :type proccess_all_fields_together: bool (optional)
        :param table: table to alter; defaults to the variants table
        :type table: str
        :return: list of added-column dicts as returned by add_column
        """

        # Drop indexes first: the ALTER/UPDATE statements below would
        # invalidate them anyway
        self.drop_indexes()

        # Connexion format ("duckdb" or "sqlite"): selects the SQL dialect
        # used to extract values from the INFO string
        connexion_format = self.get_connexion_format()

        # Access mode; "RO" (read-only) leaves the table untouched
        access = self.get_config().get("access", None)

        # Columns actually added by this call (return value)
        added_columns = []

        if access not in ["RO"]:

            # Resolve prefix: explicit argument > configured prefix > "INFO/"
            if prefix in [None, True] or not isinstance(prefix, str):
                if self.get_explode_infos_prefix() not in [None, True]:
                    prefix = self.get_explode_infos_prefix()
                else:
                    prefix = "INFO/"

            # Target table: explicit argument or the variants table
            if table is not None:
                table_variants = table
            else:
                table_variants = self.get_table_variants(clause="select")

            # Extra infos: best-effort, an empty list on any failure
            try:
                extra_infos = self.get_extra_infos()
            except:
                extra_infos = []

            # INFO field definitions from the VCF header (name -> type/num)
            header_infos = self.get_header().infos

            log.debug(
                f"Explode INFO fields - ADD [{len(header_infos)}] annotations fields"
            )

            sql_info_alter_table_array = []

            # Known fields = header fields plus any explicitly requested ones
            fields_list = list(header_infos)
            if fields:
                fields_list += fields
            fields_list = set(fields_list)

            # If no fields
            if not fields:
                fields = []

            # Resolve field patterns (e.g. "ALL", wildcards) into field names
            fields = self.get_explode_infos_fields(explode_infos_fields=fields)

            for info in fields:

                info_id_sql = prefix + info

                # Only explode fields known from the header, the request
                # itself, or the extra infos
                if (
                    info in fields_list
                    or prefix + info in fields_list
                    or info in extra_infos
                ):

                    log.debug(f"Explode INFO fields - ADD '{info}' annotations fields")

                    # Type/number from the header; unknown fields default to
                    # a single String value
                    if info in header_infos:
                        info_type = header_infos[info].type
                        info_num = header_infos[info].num
                    else:
                        info_type = "String"
                        info_num = 0

                    # Map VCF type to SQL type; multi-valued fields are kept
                    # as VARCHAR since they cannot be a scalar column
                    type_sql = self.code_type_map_to_sql.get(info_type, "VARCHAR")
                    if info_num != 1:
                        type_sql = "VARCHAR"

                    # Add the column (dropped first when force=True)
                    added_column = self.add_column(
                        table_name=table_variants,
                        column_name=info_id_sql,
                        column_type=type_sql,
                        default_value="null",
                        drop=force,
                    )

                    if added_column:
                        added_columns.append(added_column)

                    # add_column returns None for a re-created column, so
                    # force must also trigger the value recomputation
                    if added_column or force:

                        # Register the column for later index creation
                        self.index_additionnal_fields.append(info_id_sql)

                        # Build the dialect-specific SET clause extracting
                        # "<info>=<value>" from the raw INFO string
                        # NOTE(review): for a connexion_format other than
                        # "duckdb"/"sqlite", update_info_field would be unbound
                        # (NameError) or stale from the previous iteration —
                        # confirm only these two formats are possible here
                        if connexion_format in ["duckdb"]:
                            update_info_field = f"""
                            "{info_id_sql}" =
                                CASE
                                    WHEN REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1) IN ('','.') THEN NULL
                                    ELSE REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1)
                                END
                            """
                        elif connexion_format in ["sqlite"]:
                            update_info_field = f"""
                                "{info_id_sql}" =
                                    CASE
                                        WHEN instr(INFO, '{info}=') = 0 THEN NULL
                                        WHEN instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';') = 0 THEN substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1)
                                        ELSE substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1, instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';')-instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')-1)
                                    END
                            """

                        sql_info_alter_table_array.append(update_info_field)

            if sql_info_alter_table_array:

                # Update chromosome by chromosome to bound each UPDATE's size;
                # a single pass ([None]) on any failure
                try:
                    chromosomes_list = list(
                        self.get_query_to_df(
                            f""" SELECT "#CHROM" FROM {table_variants} GROUP BY "#CHROM" """
                        )["#CHROM"]
                    )
                except:
                    chromosomes_list = [None]

                for chrom in chromosomes_list:
                    log.debug(f"Explode INFO fields - Chromosome {chrom}...")

                    # Restrict to one chromosome only when there are several
                    where_clause = ""
                    if chrom and len(chromosomes_list) > 1:
                        where_clause = f""" WHERE "#CHROM" = '{chrom}' """

                    # Either one UPDATE for all fields, or one per field
                    if proccess_all_fields_together:
                        sql_info_alter_table_array_join = ", ".join(
                            sql_info_alter_table_array
                        )
                        if sql_info_alter_table_array_join:
                            sql_info_alter_table = f"""
                                UPDATE {table_variants}
                                SET {sql_info_alter_table_array_join}
                                {where_clause}
                                """
                            log.debug(
                                f"Explode INFO fields - Explode all {len(sql_info_alter_table_array)} fields..."
                            )
                            # log.debug(sql_info_alter_table)
                            self.conn.execute(sql_info_alter_table)
                    else:
                        sql_info_alter_num = 0
                        for sql_info_alter in sql_info_alter_table_array:
                            sql_info_alter_num += 1
                            sql_info_alter_table = f"""
                                UPDATE {table_variants}
                                SET {sql_info_alter}
                                {where_clause}
                                """
                            log.debug(
                                f"Explode INFO fields - Explode field {sql_info_alter_num}/{len(sql_info_alter_table_array)}..."
                            )
                            # log.debug(sql_info_alter_table)
                            self.conn.execute(sql_info_alter_table)

        # Re-create indexes if requested
        if create_index:
            self.create_indexes()

        return added_columns

The explode_infos function in Python takes a VCF file and explodes the INFO fields into individual columns, returning a list of added columns.

Parameters
  • prefix: The prefix parameter is a string that is used as a prefix for the exploded INFO fields. If the prefix is not provided or is set to None, the function will use the value of self.get_explode_infos_prefix() as the prefix
  • create_index: The create_index parameter is a boolean flag that specifies whether to create indexes on the exploded INFO fields. If set to True, indexes will be created; if set to False, indexes will not be created. The default value is False, defaults to False
  • fields: The fields parameter in the explode_infos function is a list of INFO fields that you want to explode into individual columns. If this parameter is not provided, all INFO fields will be exploded. You can specify the INFO fields you want to explode by passing them as a list to the fields parameter
  • force: The force parameter in the explode_infos function is a boolean flag that determines whether to drop and recreate a column if it already exists in the table. If force is set to True, the column will be dropped and recreated; if force is set to False, an existing column is kept. Defaults to False
  • proccess_all_fields_together: The proccess_all_fields_together parameter is a boolean flag that determines whether to process all the INFO fields together or individually. If set to True, all the INFO fields will be processed together. If set to False, each INFO field will be processed individually. The default value is, defaults to False
  • table: The table parameter in the explode_infos function is used to specify the name of the table where the exploded INFO fields will be added as individual columns. If you provide a value for the table parameter, the function will use that table name. If the table parameter is
Returns

The explode_infos function returns a list of added columns.

def create_indexes(self) -> None:
1972    def create_indexes(self) -> None:
1973        """
1974        Create indexes on the table after insertion
1975        """
1976
1977        # Access
1978        access = self.get_config().get("access", None)
1979
1980        # get table variants
1981        table_variants = self.get_table_variants("FROM")
1982
1983        if self.get_indexing() and access not in ["RO"]:
1984            # Create index
1985            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()} ON {table_variants} ("#CHROM", "POS", "REF", "ALT")'
1986            self.conn.execute(sql_create_table_index)
1987            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_chrom ON {table_variants} ("#CHROM")'
1988            self.conn.execute(sql_create_table_index)
1989            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_pos ON {table_variants} ("POS")'
1990            self.conn.execute(sql_create_table_index)
1991            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_ref ON {table_variants} ( "REF")'
1992            self.conn.execute(sql_create_table_index)
1993            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_alt ON {table_variants} ("ALT")'
1994            self.conn.execute(sql_create_table_index)
1995            for field in self.index_additionnal_fields:
1996                sql_create_table_index = f""" CREATE INDEX IF NOT EXISTS "idx_{self.get_table_variants()}_{field}" ON {table_variants} ("{field}") """
1997                self.conn.execute(sql_create_table_index)

Create indexes on the table after insertion

def drop_indexes(self) -> None:
1999    def drop_indexes(self) -> None:
2000        """
2001        Create indexes on the table after insertion
2002        """
2003
2004        # Access
2005        access = self.get_config().get("access", None)
2006
2007        # get table variants
2008        table_variants = self.get_table_variants("FROM")
2009
2010        # Get database format
2011        connexion_format = self.get_connexion_format()
2012
2013        if access not in ["RO"]:
2014            if connexion_format in ["duckdb"]:
2015                sql_list_indexes = f"SELECT index_name FROM duckdb_indexes WHERE table_name='{table_variants}'"
2016            elif connexion_format in ["sqlite"]:
2017                sql_list_indexes = f"SELECT name FROM sqlite_master WHERE type='index' AND tbl_name='{table_variants}';"
2018
2019            list_indexes = self.conn.execute(sql_list_indexes)
2020            index_names = [row[0] for row in list_indexes.fetchall()]
2021            for index in index_names:
2022                sql_drop_table_index = f""" DROP INDEX IF EXISTS "{index}" """
2023                self.conn.execute(sql_drop_table_index)

Drop all existing indexes of the variants table.

def read_vcf_header(self, f) -> list:
2025    def read_vcf_header(self, f) -> list:
2026        """
2027        It reads the header of a VCF file and returns a list of the header lines
2028
2029        :param f: the file object
2030        :return: The header lines of the VCF file.
2031        """
2032
2033        header_list = []
2034        for line in f:
2035            header_list.append(line)
2036            if line.startswith("#CHROM"):
2037                break
2038        return header_list

It reads the header of a VCF file and returns a list of the header lines

Parameters
  • f: the file object
Returns

The header lines of the VCF file.

def read_vcf_header_file(self, file: str = None) -> list:
2040    def read_vcf_header_file(self, file: str = None) -> list:
2041        """
2042        The `read_vcf_header_file` function reads the header of a VCF file, handling both compressed and
2043        uncompressed files.
2044
2045        :param file: The `file` parameter is a string that represents the path to the VCF header file
2046        that you want to read. It is an optional parameter, so if you don't provide a value, it will
2047        default to `None`
2048        :type file: str
2049        :return: The function `read_vcf_header_file` returns a list.
2050        """
2051
2052        if self.get_input_compressed(input_file=file):
2053            with bgzf.open(file, "rt") as f:
2054                return self.read_vcf_header(f=f)
2055        else:
2056            with open(file, "rt") as f:
2057                return self.read_vcf_header(f=f)

The read_vcf_header_file function reads the header of a VCF file, handling both compressed and uncompressed files.

Parameters
  • file: The file parameter is a string that represents the path to the VCF header file that you want to read. It is an optional parameter, so if you don't provide a value, it will default to None
Returns

The function read_vcf_header_file returns a list.

def execute_query(self, query: str):
2059    def execute_query(self, query: str):
2060        """
2061        It takes a query as an argument, executes it, and returns the results
2062
2063        :param query: The query to be executed
2064        :return: The result of the query is being returned.
2065        """
2066        if query:
2067            return self.conn.execute(query)  # .fetchall()
2068        else:
2069            return None

It takes a query as an argument, executes it, and returns the results

Parameters
  • query: The query to be executed
Returns

The result of the query is being returned.

def export_output(
    self,
    output_file: str | None = None,
    output_header: str | None = None,
    export_header: bool = True,
    query: str | None = None,
    parquet_partitions: list | None = None,
    chunk_size: int | None = None,
    threads: int | None = None,
    sort: bool = False,
    index: bool = False,
    order_by: str | None = None,
) -> bool:
    """
    Export the variants data to an output file in various formats,
    including VCF, CSV, TSV, PSV, and Parquet.

    :param output_file: name of the output file to generate; defaults to
        the object's configured output
    :type output_file: str
    :param output_header: name of the file where the VCF header will be
        exported; defaults to the output file name with the extension ".hdr"
    :type output_header: str
    :param export_header: whether the header should be exported to a
        separate file; switched off automatically when the output format
        is VCF (the header is embedded in the file), defaults to True
    :type export_header: bool (optional)
    :param query: optional SQL query used to select the data to export;
        when provided, only matching data is exported
    :type query: str
    :param parquet_partitions: columns used to partition the Parquet file
        during export (list, or comma-separated string)
    :type parquet_partitions: list
    :param chunk_size: number of records per batch when exporting in
        Parquet format (used to split the export into multiple files)
    :type chunk_size: int
    :param threads: number of threads to use during the export; defaults
        to the object's configured number of threads
    :type threads: int
    :param sort: whether the output file should be sorted by genomic
        coordinates, defaults to False
    :type sort: bool (optional)
    :param index: whether an index should be created on the output file,
        defaults to False
    :type index: bool (optional)
    :param order_by: column(s) to use for sorting the output file
    :type order_by: str
    :return: True if the output file exists after export, None otherwise
    """

    # Log
    log.info("Exporting...")

    # Resolve full paths
    output_file = full_path(output_file)
    output_header = full_path(output_header)

    # Config
    config = self.get_config()

    # Param
    param = self.get_param()

    # Temporary files to remove at the end
    tmp_to_remove = []

    # Default output file
    if not output_file:
        output_file = self.get_output()

    # Default number of threads
    if not threads:
        threads = self.get_threads()

    # Auto header name with extension
    if export_header or output_header:
        if not output_header:
            output_header = f"{output_file}.hdr"
        # Export header
        # NOTE(review): export_header always writes to output_file + ".hdr";
        # confirm this is intended when an explicit output_header differs
        self.export_header(output_file=output_file)

    # Switch off separate header export if VCF output
    # (the header is embedded in the VCF file itself)
    output_file_type = get_file_format(output_file)
    if output_file_type in ["vcf"]:
        export_header = False
        tmp_to_remove.append(output_header)

    # Chunk size
    if not chunk_size:
        chunk_size = config.get("chunk_size", None)

    # Parquet partitions (list, or comma-separated string)
    if not parquet_partitions:
        parquet_partitions = param.get("export", {}).get("parquet_partitions", None)
    if parquet_partitions and isinstance(parquet_partitions, str):
        parquet_partitions = parquet_partitions.split(",")

    # Order by
    if not order_by:
        order_by = param.get("export", {}).get("order_by", "")

    # Header in output
    header_in_output = param.get("export", {}).get("include_header", False)

    # Database source (the current connexion by default)
    database_source = self.get_connexion()

    # Connexion format
    connexion_format = self.get_connexion_format()

    # Explode infos
    if self.get_explode_infos():
        self.explode_infos(
            prefix=self.get_explode_infos_prefix(),
            fields=self.get_explode_infos_fields(),
            force=False,
        )

    # SQLite connexions cannot be exported directly:
    # dump the variants table to a temporary Parquet file first
    if connexion_format in ["sqlite"]:

        # Export in Parquet
        random_tmp = "".join(
            random.choice(string.ascii_lowercase) for i in range(10)
        )
        database_source = f"""{output_file}.{random_tmp}.database_export.parquet"""
        tmp_to_remove.append(database_source)

        # Table Variants
        table_variants = self.get_table_variants()

        # Create export query
        sql_query_export_subquery = f"""
            SELECT * FROM {table_variants}
            """

        # Write source file
        fp.write(database_source, self.get_query_to_df(sql_query_export_subquery))

    # Create database object on the source
    database = Database(
        database=database_source,
        table="variants",
        header_file=output_header,
        conn_config=self.get_connexion_config(),
    )

    # Existing columns in header
    existing_columns_header = database.get_header_columns_from_database()

    # Sample list
    get_samples = self.get_samples()
    get_samples_check = self.get_samples_check()
    samples_force = get_samples is not None
    sample_list = self.get_header_sample_list(
        check=get_samples_check, samples=get_samples, samples_force=samples_force
    )

    # Export file
    database.export(
        output_database=output_file,
        output_header=output_header,
        existing_columns_header=existing_columns_header,
        parquet_partitions=parquet_partitions,
        chunk_size=chunk_size,
        threads=threads,
        sort=sort,
        index=index,
        header_in_output=header_in_output,
        order_by=order_by,
        query=query,
        export_header=export_header,
        sample_list=sample_list,
    )

    # Remove temporary files
    remove_if_exists(tmp_to_remove)

    # True if the output exists, None otherwise
    # (the original duplicated this expression on both sides of an `and`)
    return os.path.exists(output_file) or None

The export_output function exports data from a VCF file to a specified output file in various formats, including VCF, CSV, TSV, PSV, and Parquet.

Parameters
  • output_file: The output_file parameter is a string that specifies the name of the output file to be generated by the function. This is where the exported data will be saved
  • output_header: The output_header parameter is a string that specifies the name of the file where the header of the VCF file will be exported. If this parameter is not provided, the header will be exported to a file with the same name as the output_file parameter, but with the extension ".hdr".
  • export_header: The export_header parameter is a boolean flag that determines whether the header of a VCF file should be exported to a separate file or not. If export_header is True, the header will be exported to a file; if False, it will not be. Defaults to True; the header export is switched off when the output format is VCF.
  • query: The query parameter is an optional SQL query that can be used to filter and select specific data from the VCF file before exporting it. If provided, only the data that matches the query will be exported
  • parquet_partitions: The parquet_partitions parameter is a list that specifies the columns to be used for partitioning the Parquet file during export. Partitioning is a way to organize data in a hierarchical directory structure based on the values of one or more columns. This can improve query performance when working with large datasets
  • chunk_size: The chunk_size parameter specifies the number of records in batch when exporting data in Parquet format. This parameter is used for partitioning the Parquet file into multiple files.
  • threads: The threads parameter is an optional parameter that specifies the number of threads to be used during the export process. It determines the level of parallelism and can improve the performance of the export operation. If not provided, the function will use the default number of threads
  • sort: The sort parameter is a boolean flag that determines whether the output file should be sorted or not. If sort is set to True, the output file will be sorted based on the genomic coordinates of the variants. By default, the value of sort is False, defaults to False
  • index: The index parameter is a boolean flag that determines whether an index should be created on the output file. If index is True, an index will be created. If index is False, no index will be created. The default value is False, defaults to False
  • order_by: The order_by parameter is a string that specifies the column(s) to use for sorting the output file. This parameter is only applicable when exporting data in VCF format
Returns

a boolean value. It checks if the output file exists and returns True if it does, or None if it doesn't.

def get_extra_infos(self, table: str = None) -> list:
    """
    Return the list of columns present in a table but absent from the header.

    :param table: name of the table to inspect; when omitted, the variants
        table is used and its header columns are taken into account
    :type table: str
    :return: columns of the table that are not header columns
    """

    header_columns = []

    # Default to the variants table, considering its header columns
    if not table:
        table = self.get_table_variants(clause="from")
        header_columns = self.get_header_columns()

    # Fetch one row to discover every column of the table
    query = f""" SELECT * FROM {table} LIMIT 1 """
    log.debug(f"query {query}")
    table_columns = self.get_query_to_df(query).columns.tolist()

    # Keep only columns that are not part of the header
    return [column for column in table_columns if column not in header_columns]

The get_extra_infos function returns a list of columns that are in a specified table but not in the header.

Parameters
  • table: The table parameter in the get_extra_infos function is used to specify the name of the table from which you want to retrieve the extra columns that are not present in the header. If the table parameter is not provided when calling the function, it will default to using the variants
Returns

A list of columns that are in the specified table but not in the header of the table.

def get_extra_infos_sql(self, table: str = None) -> str:
    """
    Return the extra info columns as a comma-separated string, each
    column name wrapped in double quotes.

    :param table: name of the table to get the extra infos from; if None,
        the default table is used
    :type table: str
    :return: the double-quoted extra info columns, comma-separated
    """

    quoted = [f'"{elem}"' for elem in self.get_extra_infos(table=table)]
    return ", ".join(quoted)

It returns a string of the extra infos, separated by commas, and each extra info is surrounded by double quotes

Parameters
  • table: The name of the table to get the extra infos from. If None, the default table is used
Returns

A string of the extra infos

def export_header(
    self,
    header_name: str = None,
    output_file: str = None,
    output_file_ext: str = ".hdr",
    clean_header: bool = True,
    remove_chrom_line: bool = False,
) -> str:
    """
    Extract the VCF header, adjust it according to the options, and write
    it to a header file next to the output file.

    :param header_name: name of the header file to create; when neither
        this nor `output_file` is given, the object's output is used
    :type header_name: str
    :param output_file: output file whose name (plus `output_file_ext`)
        is used for the written header file
    :type output_file: str
    :param output_file_ext: extension appended to `output_file` to build
        the header file name, defaults to ".hdr"
    :type output_file_ext: str (optional)
    :param clean_header: whether malformed header lines should be fixed
        (FORMAT fields of Type=Flag rewritten as Type=String), defaults
        to True
    :type clean_header: bool (optional)
    :param remove_chrom_line: whether the #CHROM line should be removed
        from the header before writing, defaults to False
    :type remove_chrom_line: bool (optional)
    :return: the name of the header file created, or None when no header
        is available
    """

    if not header_name and not output_file:
        output_file = self.get_output()

    # Header file name to return (stays None when no header is available;
    # the original raised UnboundLocalError in that case)
    tmp_header_name = None

    if self.get_header():

        # Get header object
        header_obj = self.get_header()

        # Create database on the input to discover its real columns
        db_for_header = Database(database=self.get_input())

        # Get real columns in the file
        db_header_columns = db_for_header.get_columns()

        with tempfile.TemporaryDirectory() as tmpdir:

            # Write header object to a temporary file
            header_file_tmp = os.path.join(tmpdir, "header")
            with open(header_file_tmp, "w") as f:
                vcf.Writer(f, header_obj)

            # Replace #CHROM line with the real columns of the file
            header_list = db_for_header.read_header_file(
                header_file=header_file_tmp
            )
            header_list[-1] = "\t".join(db_header_columns)

            # Remove CHROM line
            if remove_chrom_line:
                header_list.pop()

            # Clean header
            if clean_header:
                header_list_clean = []
                for head in header_list:
                    # FORMAT fields cannot be of Type=Flag:
                    # rewrite them as Type=String
                    head_clean = re.subn(
                        "##FORMAT=<ID=(.*),Number=(.*),Type=Flag",
                        r"##FORMAT=<ID=\1,Number=\2,Type=String",
                        head,
                        2,
                    )[0]
                    header_list_clean.append(head_clean)
                header_list = header_list_clean

        # NOTE(review): header_name is not used to build the output path —
        # the header is always written to output_file + output_file_ext
        tmp_header_name = output_file + output_file_ext

        with open(tmp_header_name, "w") as f:
            for line in header_list:
                f.write(line)

    return tmp_header_name

The export_header function takes a VCF file, extracts the header, modifies it according to specified options, and writes it to a new file.

Parameters
  • header_name: The header_name parameter is the name of the header file to be created. If this parameter is not specified, the header will be written to the output file
  • output_file: The output_file parameter in the export_header function is used to specify the name of the output file where the header will be written. If this parameter is not provided, the header will be written to a temporary file
  • output_file_ext: The output_file_ext parameter in the export_header function is a string that represents the extension of the output header file. By default, it is set to ".hdr" if not specified by the user. This extension will be appended to the output_file name to create the final, defaults to .hdr
  • clean_header: The clean_header parameter in the export_header function is a boolean flag that determines whether the header should be cleaned or not. When clean_header is set to True, the function will clean the header by modifying certain lines based on a specific pattern. Defaults to True.
  • remove_chrom_line: The remove_chrom_line parameter in the export_header function is a boolean flag that determines whether the #CHROM line should be removed from the header before writing it to the output file. If set to True, the #CHROM line will be removed; if set to False, it is kept. Defaults to False.
Returns

The function export_header returns the name of the temporary header file that is created.

def export_variant_vcf( self, vcf_file, remove_info: bool = False, add_samples: bool = True, list_samples: list = [], where_clause: str = '', index: bool = False, threads: int | None = None) -> bool | None:
2409    def export_variant_vcf(
2410        self,
2411        vcf_file,
2412        remove_info: bool = False,
2413        add_samples: bool = True,
2414        list_samples: list = [],
2415        where_clause: str = "",
2416        index: bool = False,
2417        threads: int | None = None,
2418    ) -> bool | None:
2419        """
2420        The `export_variant_vcf` function exports a VCF file with specified samples, allowing options to
2421        remove INFO field, add samples, and control compression and indexing.
2422
2423        :param vcf_file: The `vcf_file` parameter is the name of the file where the VCF data will be
2424        written to. It is the output file that will contain the filtered VCF data based on the specified
2425        parameters
2426        :param remove_info: The `remove_info` parameter in the `export_variant_vcf` function is a
2427        boolean flag that determines whether to remove the INFO field from the output VCF file. If set
2428        to `True`, the INFO field will be removed. If set to `False`, the INFO field will be included
2429        in, defaults to False
2430        :type remove_info: bool (optional)
2431        :param add_samples: The `add_samples` parameter is a boolean parameter that determines whether
2432        the samples should be added to the VCF file or not. If set to True, the samples will be added.
2433        If set to False, the samples will be removed. The default value is True, defaults to True
2434        :type add_samples: bool (optional)
2435        :param list_samples: The `list_samples` parameter is a list of samples that you want to include
2436        in the output VCF file. By default, all samples will be included. If you provide a list of
2437        samples, only those samples will be included in the output file
2438        :type list_samples: list
2439        :param index: The `index` parameter in the `export_variant_vcf` function is a boolean flag that
2440        determines whether or not to create an index for the output VCF file. If `index` is set to
2441        `True`, the output VCF file will be indexed using tabix. If `index`, defaults to False
2442        :type index: bool (optional)
2443        :param threads: The `threads` parameter in the `export_variant_vcf` function specifies the
2444        number of threads to use for exporting the VCF file. It determines how many parallel threads
2445        will be used during the export process. More threads can potentially speed up the export process
2446        by utilizing multiple cores of the processor. If
2447        :type threads: int | None
2448        :return: The `export_variant_vcf` function returns the result of calling the `export_output`
2449        method with various parameters including the output file, query, threads, sort flag, and index
2450        flag. The `export_output` method is responsible for exporting the VCF data based on the
2451        specified parameters and configurations provided in the `export_variant_vcf` function.
2452        """
2453
2454        # Config
2455        config = self.get_config()
2456
2457        # Extract VCF
2458        log.debug("Export VCF...")
2459
2460        # Table variants
2461        table_variants = self.get_table_variants()
2462
2463        # Threads
2464        if not threads:
2465            threads = self.get_threads()
2466
2467        # Info fields
2468        if remove_info:
2469            if not isinstance(remove_info, str):
2470                remove_info = "."
2471            info_field = f"""'{remove_info}' as INFO"""
2472        else:
2473            info_field = "INFO"
2474
2475        # Samples fields
2476        if add_samples:
2477            if not list_samples:
2478                list_samples = self.get_header_sample_list()
2479            if list_samples:
2480                samples_fields = " , FORMAT , " + " , ".join(list_samples)
2481            else:
2482                samples_fields = ""
2483            log.debug(f"samples_fields: {samples_fields}")
2484        else:
2485            samples_fields = ""
2486
2487        # Where clause
2488        if where_clause is None:
2489            where_clause = ""
2490
2491        # Variants
2492        select_fields = """ "#CHROM", POS, ID, REF, ALT, QUAL, FILTER """
2493        sql_query_select = f""" SELECT {select_fields}, {info_field} {samples_fields} FROM {table_variants} {where_clause} """
2494        log.debug(f"sql_query_select={sql_query_select}")
2495
2496        return self.export_output(
2497            output_file=vcf_file,
2498            output_header=None,
2499            export_header=True,
2500            query=sql_query_select,
2501            parquet_partitions=None,
2502            chunk_size=config.get("chunk_size", None),
2503            threads=threads,
2504            sort=True,
2505            index=index,
2506            order_by=None,
2507        )

The export_variant_vcf function exports a VCF file with specified samples, allowing options to remove INFO field, add samples, and control compression and indexing.

Parameters
  • vcf_file: The vcf_file parameter is the name of the file where the VCF data will be written to. It is the output file that will contain the filtered VCF data based on the specified parameters
  • remove_info: The remove_info parameter in the export_variant_vcf function is a boolean flag that determines whether to remove the INFO field from the output VCF file. If set to True, the INFO field will be removed. If set to False, the INFO field will be included in, defaults to False
  • add_samples: The add_samples parameter is a boolean parameter that determines whether the samples should be added to the VCF file or not. If set to True, the samples will be added. If set to False, the samples will be removed. The default value is True, defaults to True
  • list_samples: The list_samples parameter is a list of samples that you want to include in the output VCF file. By default, all samples will be included. If you provide a list of samples, only those samples will be included in the output file
  • index: The index parameter in the export_variant_vcf function is a boolean flag that determines whether or not to create an index for the output VCF file. If index is set to True, the output VCF file will be indexed using tabix; if False, no index is created. Defaults to False.
  • threads: The threads parameter in the export_variant_vcf function specifies the number of threads to use for exporting the VCF file. It determines how many parallel threads will be used during the export process. More threads can potentially speed up the export process by utilizing multiple cores of the processor. If not provided, the configured default number of threads is used.
Returns

The export_variant_vcf function returns the result of calling the export_output method with various parameters including the output file, query, threads, sort flag, and index flag. The export_output method is responsible for exporting the VCF data based on the specified parameters and configurations provided in the export_variant_vcf function.

def run_commands(self, commands: list = None, threads: int = 1) -> None:
    """
    Run a list of commands in parallel using the given number of threads.

    :param commands: the commands to run; defaults to no commands
    :param threads: the number of threads to use, defaults to 1 (optional)
    """

    # Avoid the mutable-default-argument pitfall of the original
    # signature (commands: list = []); behavior is unchanged
    if commands is None:
        commands = []

    run_parallel_commands(commands, threads)

It takes a list of commands and runs them in parallel using the number of threads specified

Parameters
  • commands: A list of commands to run
  • threads: The number of threads to use, defaults to 1 (optional)
def get_threads(self, default: int = 1) -> int:
    """
    Return the number of threads to use for the current job.

    The value is looked up in the parameters first, then in the
    configuration. A non-positive value means "use all available cores";
    no value at all falls back to `default`.

    :param default: fallback number of threads when nothing is
        configured, defaults to 1
    :type default: int (optional)
    :return: the number of threads to use
    """

    # Parameters take precedence over configuration
    config = self.get_config()
    param = self.get_param()
    input_thread = param.get("threads", config.get("threads", None))

    if not input_thread:
        return default
    if int(input_thread) <= 0:
        # Non-positive value: use every available CPU
        return os.cpu_count()
    return int(input_thread)

This function returns the number of threads to use for a job, with a default value of 1 if not specified.

Parameters
  • default: The default parameter in the get_threads method is used to specify the default number of threads to use if no specific value is provided. If no value is provided for the threads parameter in the configuration or input parameters, the default value will be used, defaults to 1
Returns

the number of threads to use for the current job.

def get_memory(self, default: str = None) -> str:
    """
    Return the memory setting, looked up in the parameters first, then in
    the configuration, falling back to `default` when neither provides one.

    :param default: fallback value when no memory setting is found
    :type default: str
    :return: the memory setting, or `default`
    """

    # Parameters take precedence over configuration
    config = self.get_config()
    param = self.get_param()
    input_memory = param.get("memory", config.get("memory", None))

    return input_memory if input_memory else default

This function retrieves the memory value from parameters or configuration with a default value if not found.

Parameters
  • default: The get_memory function takes in a default value as a string parameter. This default value is used as a fallback in case the memory parameter is not provided in the param dictionary or the config dictionary. If memory is not found in either dictionary, the function returns the provided default value.
Returns

The get_memory function returns a string value representing the memory parameter. If the input_memory is provided in the parameters, it will return that value. Otherwise, it will return the default value provided as an argument to the function.

def update_from_vcf(self, vcf_file: str) -> None:
    """
    Dispatch the VCF update to the method matching the connexion format.

    If the database connexion is duckdb, use the duckdb method; if it is
    sqlite, use the sqlite method.

    :param vcf_file: the path to the VCF file
    :raises ValueError: if the connexion format is neither duckdb nor sqlite
    """

    connexion_format = self.get_connexion_format()

    if connexion_format in ["duckdb"]:
        self.update_from_vcf_duckdb(vcf_file)
    elif connexion_format in ["sqlite"]:
        self.update_from_vcf_sqlite(vcf_file)
    else:
        # Fail loudly instead of silently skipping the update
        msg_err = f"Update from VCF failed: connexion format '{connexion_format}' not supported"
        log.error(msg_err)
        raise ValueError(msg_err)

If the database connexion is duckdb, use the duckdb method; if it is sqlite, use the sqlite method

Parameters
  • vcf_file: the path to the VCF file
def update_from_vcf_duckdb(self, vcf_file: str) -> None:
2596    def update_from_vcf_duckdb(self, vcf_file: str) -> None:
2597        """
2598        It takes a VCF file and updates the INFO column of the variants table in the database with the
2599        INFO column of the VCF file
2600
2601        :param vcf_file: the path to the VCF file
2602        """
2603
2604        # varaints table
2605        table_variants = self.get_table_variants()
2606
2607        # Loading VCF into temporaire table
2608        skip = self.get_header_length(file=vcf_file)
2609        vcf_df = pd.read_csv(
2610            vcf_file,
2611            sep="\t",
2612            engine="c",
2613            skiprows=skip,
2614            header=0,
2615            low_memory=False,
2616        )
2617        sql_query_update = f"""
2618        UPDATE {table_variants} as table_variants
2619            SET INFO = concat(
2620                            CASE
2621                                WHEN INFO NOT IN ('', '.')
2622                                THEN INFO
2623                                ELSE ''
2624                            END,
2625                            (
2626                                SELECT 
2627                                    concat(
2628                                        CASE
2629                                            WHEN table_variants.INFO NOT IN ('','.') AND table_parquet.INFO NOT IN ('','.')
2630                                            THEN ';'
2631                                            ELSE ''
2632                                        END
2633                                        ,
2634                                        CASE
2635                                            WHEN table_parquet.INFO NOT IN ('','.')
2636                                            THEN table_parquet.INFO
2637                                            ELSE ''
2638                                        END
2639                                    )
2640                                FROM vcf_df as table_parquet
2641                                        WHERE CAST(table_parquet.\"#CHROM\" AS VARCHAR) = CAST(table_variants.\"#CHROM\" AS VARCHAR)
2642                                        AND table_parquet.\"POS\" = table_variants.\"POS\"
2643                                        AND table_parquet.\"ALT\" = table_variants.\"ALT\"
2644                                        AND table_parquet.\"REF\" = table_variants.\"REF\"
2645                                        AND table_parquet.INFO NOT IN ('','.')
2646                            )
2647                        )
2648            ;
2649            """
2650        self.conn.execute(sql_query_update)

It takes a VCF file and updates the INFO column of the variants table in the database with the INFO column of the VCF file

Parameters
  • vcf_file: the path to the VCF file
def update_from_vcf_sqlite(self, vcf_file: str) -> None:
    """
    Update the INFO column of the variants table with the INFO column of a
    VCF file, going through a temporary SQLite table.

    The VCF is loaded into a temporary table sharing the variants table
    structure, the INFO values are merged into the variants table, then the
    temporary table is dropped.

    :param vcf_file: the path to the VCF file to update the database with
    """

    # Temporary table with the variants structure (WHERE 0 keeps it empty)
    table_vcf = "tmp_vcf"
    self.conn.execute(
        f"CREATE TEMPORARY TABLE {table_vcf} AS SELECT * FROM variants WHERE 0"
    )

    # Load the VCF into the temporary table ('#' lines are header lines)
    records = pd.read_csv(
        vcf_file, sep="\t", comment="#", header=None, low_memory=False
    )
    records.columns = ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"]
    records.to_sql(table_vcf, self.conn, if_exists="append", index=False)

    # Merge INFO values into 'variants'
    # warning: CONCAT as || operator
    sql_query_update = f"""
        UPDATE variants as table_variants
        SET INFO = CASE
                        WHEN INFO NOT IN ('', '.')
                        THEN INFO
                        ELSE ''
                    END ||
                    (
                    SELECT 
                        CASE 
                            WHEN table_variants.INFO NOT IN ('','.') 
                                AND table_vcf.INFO NOT IN ('','.')  
                            THEN ';' 
                            ELSE '' 
                        END || 
                        CASE 
                            WHEN table_vcf.INFO NOT IN ('','.') 
                            THEN table_vcf.INFO 
                            ELSE '' 
                        END
                    FROM {table_vcf} as table_vcf
                    WHERE table_vcf.\"#CHROM\" = table_variants.\"#CHROM\"
                        AND table_vcf.\"POS\" = table_variants.\"POS\"
                        AND table_vcf.\"ALT\" = table_variants.\"ALT\"
                        AND table_vcf.\"REF\" = table_variants.\"REF\"
                    )
    """
    self.conn.execute(sql_query_update)

    # Clean up the temporary table
    self.conn.execute(f"DROP TABLE {table_vcf}")

It creates a temporary table in the SQLite database, loads the VCF file into the temporary table, then updates the INFO column of the variants table with the INFO column of the temporary table

Parameters
  • vcf_file: The path to the VCF file you want to update the database with
def drop_variants_table(self) -> None:
    """
    Drop the variants table from the database, if it exists.
    """

    sql_drop = f"DROP TABLE IF EXISTS {self.get_table_variants()}"
    self.conn.execute(sql_drop)

This function drops the variants table

def set_variant_id(
    self, variant_id_column: str = "variant_id", force: bool = None
) -> str:
    """
    Add a column to the variants table (default `variant_id`) and populate
    it with a hash of the assembly, `#CHROM`, `POS`, `REF`, `ALT` and the
    exploded `SVTYPE` columns.

    :param variant_id_column: The name of the column to be created in the
        variants table, defaults to variant_id
    :type variant_id_column: str (optional)
    :param force: If True, the variant_id column will be (re)computed even
        if it already exists
    :type force: bool
    :return: The name of the column that contains the variant_id
    """

    # Assembly
    assembly = self.get_param().get(
        "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
    )

    # INFO/Tag prefix
    prefix = self.get_explode_infos_prefix()

    # Explode INFO/SVTYPE so the SV type value can participate in the hash
    added_columns = self.explode_infos(prefix=prefix, fields=["SVTYPE"])

    # variants table
    table_variants = self.get_table_variants()

    # variant_id column
    if not variant_id_column:
        variant_id_column = "variant_id"

    # Create variant_id column
    if "variant_id" not in self.get_extra_infos() or force:

        # Create column
        self.add_column(
            table_name=table_variants,
            column_name=variant_id_column,
            column_type="UBIGINT",
            default_value="0",
        )

        # Update column
        # BUGFIX: the SVTYPE column must be referenced as a double-quoted
        # identifier; the previous single-quoted form ('"{prefix}SVTYPE"')
        # passed the constant string to hash() instead of the column value.
        self.conn.execute(
            f"""
                UPDATE {table_variants}
                SET "{variant_id_column}" = hash('{assembly}', "#CHROM", "POS", "REF", "ALT", "{prefix}SVTYPE")
            """
        )

    # Remove added columns
    for added_column in added_columns:
        self.drop_column(column=added_column)

    # return variant_id column name
    return variant_id_column

It adds a column to the variants table called variant_id and populates it with a hash of the #CHROM, POS, REF, and ALT columns

Parameters
  • variant_id_column: The name of the column to be created in the variants table, defaults to variant_id
  • force: If True, the variant_id column will be created even if it already exists
Returns

The name of the column that contains the variant_id

def get_variant_id_column(
    self, variant_id_column: str = "variant_id", force: bool = None
) -> str:
    """
    Return the name of the variant_id column, creating/updating it through
    `set_variant_id` when needed.

    :param variant_id_column: The name of the column that contains the
        variant IDs, defaults to variant_id
    :type variant_id_column: str (optional)
    :param force: Forwarded to `set_variant_id`; when truthy, the column is
        recomputed even if it already exists
    :type force: bool
    :return: The variant_id column name.
    """

    # Delegates entirely to set_variant_id, which only recomputes when forced
    return self.set_variant_id(variant_id_column=variant_id_column, force=force)

This function returns the variant_id column name

Parameters
  • variant_id_column: The name of the column in the dataframe that contains the variant IDs, defaults to variant_id
  • force: If True, the variant_id column is recomputed even if it already exists. If False or None, the variant_id column is only created and populated when it does not already exist.
Returns

The variant_id column name.

def scan_databases(
    self,
    database_formats: list = ["parquet"],
    database_releases: list = ["current"],
) -> dict:
    """
    Scan the configured database folders for available annotation databases
    matching the requested formats and releases.

    :param database_formats: formats of the databases to scan for
        (e.g. ["parquet"])
    :type database_formats: list
    :param database_releases: releases of the databases to scan for,
        defaults to ["current"]
    :type database_releases: list
    :return: a dictionary describing each database found
    """

    # Config and Param
    config = self.get_config()
    param = self.get_param()

    # Resolve assembly: param overrides config, with a warned default
    assembly = param.get("assembly", config.get("assembly", None))
    if not assembly:
        assembly = DEFAULT_ASSEMBLY
        log.warning(f"Default assembly '{assembly}'")

    # Scan for available databases
    log.info(
        f"Annotations - Check annotation parameters - Scan existing databases - Assembly {[assembly]} - Formats {database_formats} - Releases {database_releases}..."
    )
    found_databases = databases_infos(
        database_folder_releases=database_releases,
        database_formats=database_formats,
        assembly=assembly,
        config=config,
    )
    log.info(
        f"Annotations - Check annotation parameters - Scan existing databases - {len(found_databases)} databases found"
    )

    return found_databases

The function scan_databases scans for available databases based on specified formats and releases.

Parameters
  • database_formats: The database_formats parameter is a list that specifies the formats of the databases to be scanned. In this case, the accepted format is "parquet"
  • database_releases: The database_releases parameter is a list that specifies the releases of the databases to be scanned. In the provided function, the default value for database_releases is set to ["current"], meaning that by default, the function will scan databases that are in the "current" release.
Returns

The function scan_databases returns a dictionary containing information about databases that match the specified formats and releases.

def annotation(self) -> None:
    """
    Annotate the VCF with the annotations specified in parameters/configuration.

    Shortcut parameters ("annotations" plus the per-tool
    "annotation_parquet", "annotation_snpsift", "annotation_snpeff",
    "annotation_bcftools", "annotation_annovar", "annotation_exomiser",
    "annotation_splice") are merged into one comma-separated
    quick-annotation list, each entry is resolved to a database file and an
    annotation tool, the result is stored in param["annotation"], and the
    per-tool annotation methods are then dispatched.

    :raises ValueError: when a quick-annotation database exists but its
        format is not supported by any annotation tool
    """

    # Config
    config = self.get_config()

    # Param
    param = self.get_param()

    # Param - Assembly (param overrides config, with a warned default)
    assembly = param.get("assembly", config.get("assembly", None))
    if not assembly:
        assembly = DEFAULT_ASSEMBLY
        log.warning(f"Default assembly '{assembly}'")

    # Annotations databases folders: union of the generic annotations
    # folders and the parquet/bcftools specific folders
    annotations_databases = set(
        config.get("folders", {})
        .get("databases", {})
        .get("annotations", [DEFAULT_ANNOTATIONS_FOLDER])
        + config.get("folders", {})
        .get("databases", {})
        .get("parquet", ["~/howard/databases/parquet/current"])
        + config.get("folders", {})
        .get("databases", {})
        .get("bcftools", ["~/howard/databases/bcftools/current"])
    )

    # Get param annotations (comma-separated string form only)
    if param.get("annotations", None) and isinstance(
        param.get("annotations", None), str
    ):
        log.debug(param.get("annotations", None))
        param_annotation_list = param.get("annotations").split(",")
    else:
        param_annotation_list = []

    # Merge each per-tool shortcut parameter into the quick-annotation
    # list, prefixed with its tool name where relevant
    if param.get("annotation_parquet", None) != None:
        log.debug(
            f"""param.get("annotation_parquet", None)={param.get("annotation_parquet", None)}"""
        )
        if isinstance(param.get("annotation_parquet", None), list):
            param_annotation_list.append(",".join(param.get("annotation_parquet")))
        else:
            param_annotation_list.append(param.get("annotation_parquet"))
    if param.get("annotation_snpsift", None) != None:
        # Multiple snpsift databases are joined with '+' under one entry
        if isinstance(param.get("annotation_snpsift", None), list):
            param_annotation_list.append(
                "snpsift:"
                + "+".join(param.get("annotation_snpsift")).replace(",", "+")
            )
        else:
            param_annotation_list.append(
                "snpsift:" + param.get("annotation_snpsift").replace(",", "+")
            )
    if param.get("annotation_snpeff", None) != None:
        param_annotation_list.append("snpeff:" + param.get("annotation_snpeff"))
    if param.get("annotation_bcftools", None) != None:
        # Multiple bcftools databases are joined with '+' under one entry
        if isinstance(param.get("annotation_bcftools", None), list):
            param_annotation_list.append(
                "bcftools:"
                + "+".join(param.get("annotation_bcftools")).replace(",", "+")
            )
        else:
            param_annotation_list.append(
                "bcftools:" + param.get("annotation_bcftools").replace(",", "+")
            )
    if param.get("annotation_annovar", None) != None:
        param_annotation_list.append("annovar:" + param.get("annotation_annovar"))
    if param.get("annotation_exomiser", None) != None:
        param_annotation_list.append("exomiser:" + param.get("annotation_exomiser"))
    if param.get("annotation_splice", None) != None:
        param_annotation_list.append("splice:" + param.get("annotation_splice"))

    # Merge param annotations list back into param
    param["annotations"] = ",".join(param_annotation_list)

    # debug
    log.debug(f"param_annotations={param['annotations']}")

    if param.get("annotations"):

        # Log
        # log.info("Annotations - Check annotation parameters")

        if not "annotation" in param:
            param["annotation"] = {}

        # List of annotations parameters: normalize the comma-separated
        # string into a dict mapping each entry to {"INFO": None}
        annotations_list_input = {}
        if isinstance(param.get("annotations", None), str):
            annotation_file_list = [
                value for value in param.get("annotations", "").split(",")
            ]
            for annotation_file in annotation_file_list:
                annotations_list_input[annotation_file] = {"INFO": None}
        else:
            annotations_list_input = param.get("annotations", {})

        log.info(f"Quick Annotations:")
        for annotation_key in list(annotations_list_input.keys()):
            log.info(f"   {annotation_key}")

        # List of annotations and associated fields
        annotations_list = {}

        for annotation_file in annotations_list_input:

            # Explode the 'ALL' keyword into every database found on disk
            if (
                annotation_file.upper() == "ALL"
                or annotation_file.upper().startswith("ALL:")
            ):

                # check ALL parameters (formats, releases), e.g.
                # "ALL:format=parquet+vcf:release=current"
                annotation_file_split = annotation_file.split(":")
                database_formats = "parquet"
                database_releases = "current"
                for annotation_file_option in annotation_file_split[1:]:
                    database_all_options_split = annotation_file_option.split("=")
                    if database_all_options_split[0] == "format":
                        database_formats = database_all_options_split[1].split("+")
                    if database_all_options_split[0] == "release":
                        database_releases = database_all_options_split[1].split("+")

                # Scan for available databases
                databases_infos_dict = self.scan_databases(
                    database_formats=database_formats,
                    database_releases=database_releases,
                )

                # Add found databases in annotation parameters
                for database_infos in databases_infos_dict.keys():
                    annotations_list[database_infos] = {"INFO": None}

            else:
                annotations_list[annotation_file] = annotations_list_input[
                    annotation_file
                ]

        # Check each database entry and route it to an annotation tool
        if len(annotations_list):

            log.info(
                f"Annotations - Check annotation parameters - Check {len(annotations_list)} databases..."
            )

            for annotation_file in annotations_list:

                # Init
                annotations = annotations_list.get(annotation_file, None)

                # Annotation snpEff ("snpeff:<options>")
                if annotation_file.startswith("snpeff"):

                    log.debug(f"Quick Annotation snpEff")

                    if "snpeff" not in param["annotation"]:
                        param["annotation"]["snpeff"] = {}

                    if "options" not in param["annotation"]["snpeff"]:
                        param["annotation"]["snpeff"]["options"] = ""

                    # snpEff options in annotations
                    param["annotation"]["snpeff"]["options"] = "".join(
                        annotation_file.split(":")[1:]
                    )

                # Annotation Annovar ("annovar:<db1>:<db2>...")
                elif annotation_file.startswith("annovar"):

                    log.debug(f"Quick Annotation Annovar")

                    if "annovar" not in param["annotation"]:
                        param["annotation"]["annovar"] = {}

                    if "annotations" not in param["annotation"]["annovar"]:
                        param["annotation"]["annovar"]["annotations"] = {}

                    # Options
                    annotation_file_split = annotation_file.split(":")
                    for annotation_file_annotation in annotation_file_split[1:]:
                        if annotation_file_annotation:
                            param["annotation"]["annovar"]["annotations"][
                                annotation_file_annotation
                            ] = annotations

                # Annotation Exomiser (whole entry parsed as key=value options)
                elif annotation_file.startswith("exomiser"):

                    log.debug(f"Quick Annotation Exomiser")

                    param["annotation"]["exomiser"] = params_string_to_dict(
                        annotation_file
                    )

                # Annotation Splice (whole entry parsed as key=value options)
                elif annotation_file.startswith("splice"):

                    log.debug(f"Quick Annotation Splice")

                    param["annotation"]["splice"] = params_string_to_dict(
                        annotation_file
                    )

                # Annotation Parquet or BCFTOOLS (database file entries)
                else:

                    # Tools detection from an explicit "tool:" prefix
                    if annotation_file.startswith("bcftools:"):
                        annotation_tool_initial = "bcftools"
                        annotation_file = ":".join(annotation_file.split(":")[1:])
                    elif annotation_file.startswith("snpsift:"):
                        annotation_tool_initial = "snpsift"
                        annotation_file = ":".join(annotation_file.split(":")[1:])
                    else:
                        annotation_tool_initial = None

                    # list of files ('+' and ':' both separate files)
                    annotation_file_list = annotation_file.replace("+", ":").split(
                        ":"
                    )

                    for annotation_file in annotation_file_list:

                        if annotation_file:

                            # Annotation tool initial
                            annotation_tool = annotation_tool_initial

                            # Find file
                            annotation_file_found = None

                            # Expand user
                            annotation_file = full_path(annotation_file)

                            if os.path.exists(annotation_file):
                                annotation_file_found = annotation_file

                            else:
                                # Find within assembly folders (first match wins)
                                for annotations_database in annotations_databases:
                                    found_files = find_all(
                                        annotation_file,
                                        os.path.join(
                                            annotations_database, assembly
                                        ),
                                    )
                                    if len(found_files) > 0:
                                        annotation_file_found = found_files[0]
                                        break
                                if not annotation_file_found and not assembly:
                                    # Find within folders (no assembly subfolder)
                                    for (
                                        annotations_database
                                    ) in annotations_databases:
                                        found_files = find_all(
                                            annotation_file, annotations_database
                                        )
                                        if len(found_files) > 0:
                                            annotation_file_found = found_files[0]
                                            break
                            log.debug(
                                f"for {annotation_file} annotation_file_found={annotation_file_found}"
                            )

                            # Full path
                            annotation_file_found = full_path(annotation_file_found)

                            if annotation_file_found:

                                # Inspect the database to pick a suitable tool
                                database = Database(database=annotation_file_found)
                                quick_annotation_format = database.get_format()
                                quick_annotation_is_compressed = (
                                    database.is_compressed()
                                )
                                # Tabix index presence (.tbi next to the file)
                                quick_annotation_is_indexed = os.path.exists(
                                    f"{annotation_file_found}.tbi"
                                )
                                # NOTE: bcftools auto-selection is currently disabled
                                bcftools_preference = False

                                # Check Annotation Tool
                                if not annotation_tool:
                                    if (
                                        bcftools_preference
                                        and quick_annotation_format
                                        in ["vcf", "bed"]
                                        and quick_annotation_is_compressed
                                        and quick_annotation_is_indexed
                                    ):
                                        annotation_tool = "bcftools"
                                    elif quick_annotation_format in [
                                        "vcf",
                                        "bed",
                                        "tsv",
                                        "tsv",
                                        "csv",
                                        "json",
                                        "tbl",
                                        "parquet",
                                        "duckdb",
                                    ]:
                                        annotation_tool = "parquet"
                                    else:
                                        log.error(
                                            f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet"
                                        )
                                        raise ValueError(
                                            f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet"
                                        )

                                log.debug(
                                    f"Quick Annotation File {annotation_file} - Annotation tool: {annotation_tool}"
                                )

                                # Annotation Tool dispatch: register the
                                # database under param["annotation"][tool]
                                if annotation_tool:
                                    if annotation_tool not in param["annotation"]:
                                        param["annotation"][annotation_tool] = {}
                                    if (
                                        "annotations"
                                        not in param["annotation"][annotation_tool]
                                    ):
                                        param["annotation"][annotation_tool][
                                            "annotations"
                                        ] = {}
                                    param["annotation"][annotation_tool][
                                        "annotations"
                                    ][annotation_file_found] = annotations

                            else:
                                # Database not found anywhere: log but keep going
                                log.error(
                                    f"Quick Annotation File {annotation_file} does NOT exist"
                                )

            # Persist the resolved annotation parameters
            self.set_param(param)

    # Run each configured annotation tool in a fixed order
    if param.get("annotation", None):
        log.info("Annotations")
        if param.get("annotation", {}).get("parquet", None):
            log.info("Annotations 'parquet'...")
            self.annotation_parquet()
        if param.get("annotation", {}).get("bcftools", None):
            log.info("Annotations 'bcftools'...")
            self.annotation_bcftools()
        if param.get("annotation", {}).get("snpsift", None):
            log.info("Annotations 'snpsift'...")
            self.annotation_snpsift()
        if param.get("annotation", {}).get("annovar", None):
            log.info("Annotations 'annovar'...")
            self.annotation_annovar()
        if param.get("annotation", {}).get("snpeff", None):
            log.info("Annotations 'snpeff'...")
            self.annotation_snpeff()
        if param.get("annotation", {}).get("exomiser", None) is not None:
            log.info("Annotations 'exomiser'...")
            self.annotation_exomiser()
        if param.get("annotation", {}).get("splice", None) is not None:
            log.info("Annotations 'splice' ...")
            self.annotation_splice()

    # Explode INFOS fields into table fields
    if self.get_explode_infos():
        self.explode_infos(
            prefix=self.get_explode_infos_prefix(),
            fields=self.get_explode_infos_fields(),
            force=True,
        )

It annotates the VCF file with the annotations specified in the config file.

def annotation_snpsift(self, threads: int = None) -> None:
3221    def annotation_snpsift(self, threads: int = None) -> None:
3222        """
3223        This function annotate with bcftools
3224
3225        :param threads: Number of threads to use
3226        :return: the value of the variable "return_value".
3227        """
3228
3229        # DEBUG
3230        log.debug("Start annotation with bcftools databases")
3231
3232        # Threads
3233        if not threads:
3234            threads = self.get_threads()
3235        log.debug("Threads: " + str(threads))
3236
3237        # Config
3238        config = self.get_config()
3239        log.debug("Config: " + str(config))
3240
3241        # Config - snpSift
3242        snpsift_bin_command = get_bin_command(
3243            bin="SnpSift.jar",
3244            tool="snpsift",
3245            bin_type="jar",
3246            config=config,
3247            default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
3248        )
3249        if not snpsift_bin_command:
3250            msg_err = f"Annotation failed: no snpsift bin '{snpsift_bin_command}'"
3251            log.error(msg_err)
3252            raise ValueError(msg_err)
3253
3254        # Config - bcftools
3255        bcftools_bin_command = get_bin_command(
3256            bin="bcftools",
3257            tool="bcftools",
3258            bin_type="bin",
3259            config=config,
3260            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
3261        )
3262        if not bcftools_bin_command:
3263            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
3264            log.error(msg_err)
3265            raise ValueError(msg_err)
3266
3267        # Config - BCFTools databases folders
3268        databases_folders = set(
3269            self.get_config()
3270            .get("folders", {})
3271            .get("databases", {})
3272            .get("annotations", ["."])
3273            + self.get_config()
3274            .get("folders", {})
3275            .get("databases", {})
3276            .get("bcftools", ["."])
3277        )
3278        log.debug("Databases annotations: " + str(databases_folders))
3279
3280        # Param
3281        annotations = (
3282            self.get_param()
3283            .get("annotation", {})
3284            .get("snpsift", {})
3285            .get("annotations", None)
3286        )
3287        log.debug("Annotations: " + str(annotations))
3288
3289        # Assembly
3290        assembly = self.get_param().get(
3291            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
3292        )
3293
3294        # Data
3295        table_variants = self.get_table_variants()
3296
3297        # Check if not empty
3298        log.debug("Check if not empty")
3299        sql_query_chromosomes = (
3300            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
3301        )
3302        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
3303        if not sql_query_chromosomes_df["count"][0]:
3304            log.info(f"VCF empty")
3305            return
3306
3307        # VCF header
3308        vcf_reader = self.get_header()
3309        log.debug("Initial header: " + str(vcf_reader.infos))
3310
3311        # Existing annotations
3312        for vcf_annotation in self.get_header().infos:
3313
3314            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
3315            log.debug(
3316                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
3317            )
3318
3319        if annotations:
3320
3321            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:
3322
3323                # Export VCF file
3324                tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz")
3325
3326                # Init
3327                commands = {}
3328
3329                for annotation in annotations:
3330                    annotation_fields = annotations[annotation]
3331
3332                    # Annotation Name
3333                    annotation_name = os.path.basename(annotation)
3334
3335                    if not annotation_fields:
3336                        annotation_fields = {"INFO": None}
3337
3338                    log.debug(f"Annotation '{annotation_name}'")
3339                    log.debug(
3340                        f"Annotation '{annotation_name}' - fields: {annotation_fields}"
3341                    )
3342
3343                    # Create Database
3344                    database = Database(
3345                        database=annotation,
3346                        databases_folders=databases_folders,
3347                        assembly=assembly,
3348                    )
3349
3350                    # Find files
3351                    db_file = database.get_database()
3352                    db_file = full_path(db_file)
3353                    db_hdr_file = database.get_header_file()
3354                    db_hdr_file = full_path(db_hdr_file)
3355                    db_file_type = database.get_format()
3356                    db_tbi_file = f"{db_file}.tbi"
3357                    db_file_compressed = database.is_compressed()
3358
3359                    # Check if compressed
3360                    if not db_file_compressed:
3361                        log.error(
3362                            f"Annotation '{annotation}' - {db_file} NOT compressed file"
3363                        )
3364                        raise ValueError(
3365                            f"Annotation '{annotation}' - {db_file} NOT compressed file"
3366                        )
3367
3368                    # Check if indexed
3369                    if not os.path.exists(db_tbi_file):
3370                        log.error(
3371                            f"Annotation '{annotation}' - {db_file} NOT indexed file"
3372                        )
3373                        raise ValueError(
3374                            f"Annotation '{annotation}' - {db_file} NOT indexed file"
3375                        )
3376
3377                    # Check index - try to create if not exists
3378                    if not os.path.exists(db_file) or not os.path.exists(db_hdr_file):
3379                        log.error("Annotation failed: database not valid")
3380                        log.error(f"Annotation annotation file: {db_file}")
3381                        log.error(f"Annotation annotation header: {db_hdr_file}")
3382                        log.error(f"Annotation annotation index: {db_tbi_file}")
3383                        raise ValueError(
3384                            f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}"
3385                        )
3386                    else:
3387
3388                        log.debug(
3389                            f"Annotation '{annotation}' - file: "
3390                            + str(db_file)
3391                            + " and "
3392                            + str(db_hdr_file)
3393                        )
3394
3395                        # Load header as VCF object
3396                        db_hdr_vcf = Variants(input=db_hdr_file)
3397                        db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
3398                        log.debug(
3399                            "Annotation database header: "
3400                            + str(db_hdr_vcf_header_infos)
3401                        )
3402
3403                        # For all fields in database
3404                        annotation_fields_full = False
3405                        if "ALL" in annotation_fields or "INFO" in annotation_fields:
3406                            annotation_fields = {
3407                                key: key for key in db_hdr_vcf_header_infos
3408                            }
3409                            log.debug(
3410                                "Annotation database header - All annotations added: "
3411                                + str(annotation_fields)
3412                            )
3413                            annotation_fields_full = True
3414
3415                        # # Create file for field rename
3416                        # log.debug("Create file for field rename")
3417                        # tmp_rename = NamedTemporaryFile(
3418                        #     prefix=self.get_prefix(),
3419                        #     dir=self.get_tmp_dir(),
3420                        #     suffix=".rename",
3421                        #     delete=False,
3422                        # )
3423                        # tmp_rename_name = tmp_rename.name
3424                        # tmp_files.append(tmp_rename_name)
3425
3426                        # Number of fields
3427                        nb_annotation_field = 0
3428                        annotation_list = []
3429                        annotation_infos_rename_list = []
3430
3431                        for annotation_field in annotation_fields:
3432
3433                            # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
3434                            annotation_fields_new_name = annotation_fields.get(
3435                                annotation_field, annotation_field
3436                            )
3437                            if not annotation_fields_new_name:
3438                                annotation_fields_new_name = annotation_field
3439
3440                            # Check if field is in DB and if field is not elready in input data
3441                            if (
3442                                annotation_field in db_hdr_vcf.get_header().infos
3443                                and annotation_fields_new_name
3444                                not in self.get_header().infos
3445                            ):
3446
3447                                log.info(
3448                                    f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'"
3449                                )
3450
3451                                # BCFTools annotate param to rename fields
3452                                if annotation_field != annotation_fields_new_name:
3453                                    annotation_infos_rename_list.append(
3454                                        f"{annotation_fields_new_name}:=INFO/{annotation_field}"
3455                                    )
3456
3457                                # Add INFO field to header
3458                                db_hdr_vcf_header_infos_number = (
3459                                    db_hdr_vcf_header_infos[annotation_field].num or "."
3460                                )
3461                                db_hdr_vcf_header_infos_type = (
3462                                    db_hdr_vcf_header_infos[annotation_field].type
3463                                    or "String"
3464                                )
3465                                db_hdr_vcf_header_infos_description = (
3466                                    db_hdr_vcf_header_infos[annotation_field].desc
3467                                    or f"{annotation_field} description"
3468                                )
3469                                db_hdr_vcf_header_infos_source = (
3470                                    db_hdr_vcf_header_infos[annotation_field].source
3471                                    or "unknown"
3472                                )
3473                                db_hdr_vcf_header_infos_version = (
3474                                    db_hdr_vcf_header_infos[annotation_field].version
3475                                    or "unknown"
3476                                )
3477
3478                                vcf_reader.infos[annotation_fields_new_name] = (
3479                                    vcf.parser._Info(
3480                                        annotation_fields_new_name,
3481                                        db_hdr_vcf_header_infos_number,
3482                                        db_hdr_vcf_header_infos_type,
3483                                        db_hdr_vcf_header_infos_description,
3484                                        db_hdr_vcf_header_infos_source,
3485                                        db_hdr_vcf_header_infos_version,
3486                                        self.code_type_map[
3487                                            db_hdr_vcf_header_infos_type
3488                                        ],
3489                                    )
3490                                )
3491
3492                                annotation_list.append(annotation_field)
3493
3494                                nb_annotation_field += 1
3495
3496                            else:
3497
3498                                if (
3499                                    annotation_field
3500                                    not in db_hdr_vcf.get_header().infos
3501                                ):
3502                                    log.warning(
3503                                        f"Annotation '{annotation_name}' - '{annotation_field}' - not available in vcf/bed file"
3504                                    )
3505                                if (
3506                                    annotation_fields_new_name
3507                                    in self.get_header().infos
3508                                ):
3509                                    log.warning(
3510                                        f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' - already exists (skipped)"
3511                                    )
3512
3513                        log.info(
3514                            f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file"
3515                        )
3516
3517                        annotation_infos = ",".join(annotation_list)
3518
3519                        if annotation_infos != "":
3520
3521                            # Annotated VCF (and error file)
3522                            tmp_annotation_vcf_name = os.path.join(
3523                                tmp_dir, os.path.basename(annotation) + ".vcf.gz"
3524                            )
3525                            tmp_annotation_vcf_name_err = (
3526                                tmp_annotation_vcf_name + ".err"
3527                            )
3528
3529                            # Add fields to annotate
3530                            if not annotation_fields_full:
3531                                annotation_infos_option = f"-info {annotation_infos}"
3532                            else:
3533                                annotation_infos_option = ""
3534
3535                            # Info fields rename
3536                            if annotation_infos_rename_list:
3537                                annotation_infos_rename = " -c " + ",".join(
3538                                    annotation_infos_rename_list
3539                                )
3540                            else:
3541                                annotation_infos_rename = ""
3542
3543                            # Annotate command
3544                            command_annotate = f"{snpsift_bin_command} annotate {annotation_infos_option} {db_file} {tmp_vcf_name} | {bcftools_bin_command} annotate --threads={threads} {annotation_infos_rename} -Oz1 -o {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} "
3545
3546                            # Add command
3547                            commands[command_annotate] = tmp_annotation_vcf_name
3548
3549                if commands:
3550
3551                    # Export VCF file
3552                    self.export_variant_vcf(
3553                        vcf_file=tmp_vcf_name,
3554                        remove_info=True,
3555                        add_samples=False,
3556                        index=True,
3557                    )
3558                    shutil.copyfile(tmp_vcf_name, "/tmp/input.vcf")
3559
3560                    # Num command
3561                    nb_command = 0
3562
3563                    # Annotate
3564                    for command_annotate in commands:
3565                        nb_command += 1
3566                        log.info(
3567                            f"Annotation - Annotate [{nb_command}/{len(commands)}]..."
3568                        )
3569                        log.debug(f"command_annotate={command_annotate}")
3570                        run_parallel_commands([command_annotate], threads)
3571
3572                        # Debug
3573                        shutil.copyfile(commands[command_annotate], "/tmp/snpsift.vcf")
3574
3575                        # Update variants
3576                        log.info(
3577                            f"Annotation - Updating [{nb_command}/{len(commands)}]..."
3578                        )
3579                        self.update_from_vcf(commands[command_annotate])

This function annotates with bcftools

Parameters
  • threads: Number of threads to use
Returns

the value of the variable "return_value".

def annotation_bcftools(self, threads: int = None) -> None:
3581    def annotation_bcftools(self, threads: int = None) -> None:
3582        """
3583        This function annotate with bcftools
3584
3585        :param threads: Number of threads to use
3586        :return: the value of the variable "return_value".
3587        """
3588
3589        # DEBUG
3590        log.debug("Start annotation with bcftools databases")
3591
3592        # Threads
3593        if not threads:
3594            threads = self.get_threads()
3595        log.debug("Threads: " + str(threads))
3596
3597        # Config
3598        config = self.get_config()
3599        log.debug("Config: " + str(config))
3600
3601        # DEBUG
3602        delete_tmp = True
3603        if self.get_config().get("verbosity", "warning") in ["debug"]:
3604            delete_tmp = False
3605            log.debug("Delete tmp files/folders: " + str(delete_tmp))
3606
3607        # Config - BCFTools bin command
3608        bcftools_bin_command = get_bin_command(
3609            bin="bcftools",
3610            tool="bcftools",
3611            bin_type="bin",
3612            config=config,
3613            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
3614        )
3615        if not bcftools_bin_command:
3616            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
3617            log.error(msg_err)
3618            raise ValueError(msg_err)
3619
3620        # Config - BCFTools databases folders
3621        databases_folders = set(
3622            self.get_config()
3623            .get("folders", {})
3624            .get("databases", {})
3625            .get("annotations", ["."])
3626            + self.get_config()
3627            .get("folders", {})
3628            .get("databases", {})
3629            .get("bcftools", ["."])
3630        )
3631        log.debug("Databases annotations: " + str(databases_folders))
3632
3633        # Param
3634        annotations = (
3635            self.get_param()
3636            .get("annotation", {})
3637            .get("bcftools", {})
3638            .get("annotations", None)
3639        )
3640        log.debug("Annotations: " + str(annotations))
3641
3642        # Assembly
3643        assembly = self.get_param().get(
3644            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
3645        )
3646
3647        # Data
3648        table_variants = self.get_table_variants()
3649
3650        # Check if not empty
3651        log.debug("Check if not empty")
3652        sql_query_chromosomes = (
3653            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
3654        )
3655        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
3656        if not sql_query_chromosomes_df["count"][0]:
3657            log.info(f"VCF empty")
3658            return
3659
3660        # Export in VCF
3661        log.debug("Create initial file to annotate")
3662        tmp_vcf = NamedTemporaryFile(
3663            prefix=self.get_prefix(),
3664            dir=self.get_tmp_dir(),
3665            suffix=".vcf.gz",
3666            delete=False,
3667        )
3668        tmp_vcf_name = tmp_vcf.name
3669
3670        # VCF header
3671        vcf_reader = self.get_header()
3672        log.debug("Initial header: " + str(vcf_reader.infos))
3673
3674        # Existing annotations
3675        for vcf_annotation in self.get_header().infos:
3676
3677            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
3678            log.debug(
3679                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
3680            )
3681
3682        if annotations:
3683
3684            tmp_ann_vcf_list = []
3685            commands = []
3686            tmp_files = []
3687            err_files = []
3688
3689            for annotation in annotations:
3690                annotation_fields = annotations[annotation]
3691
3692                # Annotation Name
3693                annotation_name = os.path.basename(annotation)
3694
3695                if not annotation_fields:
3696                    annotation_fields = {"INFO": None}
3697
3698                log.debug(f"Annotation '{annotation_name}'")
3699                log.debug(
3700                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
3701                )
3702
3703                # Create Database
3704                database = Database(
3705                    database=annotation,
3706                    databases_folders=databases_folders,
3707                    assembly=assembly,
3708                )
3709
3710                # Find files
3711                db_file = database.get_database()
3712                db_file = full_path(db_file)
3713                db_hdr_file = database.get_header_file()
3714                db_hdr_file = full_path(db_hdr_file)
3715                db_file_type = database.get_format()
3716                db_tbi_file = f"{db_file}.tbi"
3717                db_file_compressed = database.is_compressed()
3718
3719                # Check if compressed
3720                if not db_file_compressed:
3721                    log.error(
3722                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
3723                    )
3724                    raise ValueError(
3725                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
3726                    )
3727
3728                # Check if indexed
3729                if not os.path.exists(db_tbi_file):
3730                    log.error(f"Annotation '{annotation}' - {db_file} NOT indexed file")
3731                    raise ValueError(
3732                        f"Annotation '{annotation}' - {db_file} NOT indexed file"
3733                    )
3734
3735                # Check index - try to create if not exists
3736                if not os.path.exists(db_file) or not os.path.exists(db_hdr_file):
3737                    log.error("Annotation failed: database not valid")
3738                    log.error(f"Annotation annotation file: {db_file}")
3739                    log.error(f"Annotation annotation header: {db_hdr_file}")
3740                    log.error(f"Annotation annotation index: {db_tbi_file}")
3741                    raise ValueError(
3742                        f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}"
3743                    )
3744                else:
3745
3746                    log.debug(
3747                        f"Annotation '{annotation}' - file: "
3748                        + str(db_file)
3749                        + " and "
3750                        + str(db_hdr_file)
3751                    )
3752
3753                    # Load header as VCF object
3754                    db_hdr_vcf = Variants(input=db_hdr_file)
3755                    db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
3756                    log.debug(
3757                        "Annotation database header: " + str(db_hdr_vcf_header_infos)
3758                    )
3759
3760                    # For all fields in database
3761                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
3762                        annotation_fields = {
3763                            key: key for key in db_hdr_vcf_header_infos
3764                        }
3765                        log.debug(
3766                            "Annotation database header - All annotations added: "
3767                            + str(annotation_fields)
3768                        )
3769
3770                    # Number of fields
3771                    nb_annotation_field = 0
3772                    annotation_list = []
3773
3774                    for annotation_field in annotation_fields:
3775
3776                        # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
3777                        annotation_fields_new_name = annotation_fields.get(
3778                            annotation_field, annotation_field
3779                        )
3780                        if not annotation_fields_new_name:
3781                            annotation_fields_new_name = annotation_field
3782
3783                        # Check if field is in DB and if field is not elready in input data
3784                        if (
3785                            annotation_field in db_hdr_vcf.get_header().infos
3786                            and annotation_fields_new_name
3787                            not in self.get_header().infos
3788                        ):
3789
3790                            log.info(
3791                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'"
3792                            )
3793
3794                            # Add INFO field to header
3795                            db_hdr_vcf_header_infos_number = (
3796                                db_hdr_vcf_header_infos[annotation_field].num or "."
3797                            )
3798                            db_hdr_vcf_header_infos_type = (
3799                                db_hdr_vcf_header_infos[annotation_field].type
3800                                or "String"
3801                            )
3802                            db_hdr_vcf_header_infos_description = (
3803                                db_hdr_vcf_header_infos[annotation_field].desc
3804                                or f"{annotation_field} description"
3805                            )
3806                            db_hdr_vcf_header_infos_source = (
3807                                db_hdr_vcf_header_infos[annotation_field].source
3808                                or "unknown"
3809                            )
3810                            db_hdr_vcf_header_infos_version = (
3811                                db_hdr_vcf_header_infos[annotation_field].version
3812                                or "unknown"
3813                            )
3814
3815                            vcf_reader.infos[annotation_fields_new_name] = (
3816                                vcf.parser._Info(
3817                                    annotation_fields_new_name,
3818                                    db_hdr_vcf_header_infos_number,
3819                                    db_hdr_vcf_header_infos_type,
3820                                    db_hdr_vcf_header_infos_description,
3821                                    db_hdr_vcf_header_infos_source,
3822                                    db_hdr_vcf_header_infos_version,
3823                                    self.code_type_map[db_hdr_vcf_header_infos_type],
3824                                )
3825                            )
3826
3827                            # annotation_list.append(annotation_field)
3828                            if annotation_field != annotation_fields_new_name:
3829                                annotation_list.append(
3830                                    f"{annotation_fields_new_name}:=INFO/{annotation_field}"
3831                                )
3832                            else:
3833                                annotation_list.append(annotation_field)
3834
3835                            nb_annotation_field += 1
3836
3837                        else:
3838
3839                            if annotation_field not in db_hdr_vcf.get_header().infos:
3840                                log.warning(
3841                                    f"Annotation '{annotation}' - '{annotation_field}' - not available in vcf/bed file"
3842                                )
3843                            if annotation_fields_new_name in self.get_header().infos:
3844                                log.warning(
3845                                    f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
3846                                )
3847
3848                    log.info(
3849                        f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file"
3850                    )
3851
3852                    annotation_infos = ",".join(annotation_list)
3853
3854                    if annotation_infos != "":
3855
3856                        # Protect header for bcftools (remove "#CHROM" and variants line)
3857                        log.debug("Protect Header file - remove #CHROM line if exists")
3858                        tmp_header_vcf = NamedTemporaryFile(
3859                            prefix=self.get_prefix(),
3860                            dir=self.get_tmp_dir(),
3861                            suffix=".hdr",
3862                            delete=False,
3863                        )
3864                        tmp_header_vcf_name = tmp_header_vcf.name
3865                        tmp_files.append(tmp_header_vcf_name)
3866                        # Command
3867                        if db_hdr_file.endswith(".gz"):
3868                            command_extract_header = f"zcat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
3869                        else:
3870                            command_extract_header = f"cat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
3871                        # Run
3872                        run_parallel_commands([command_extract_header], 1)
3873
3874                        # Find chomosomes
3875                        log.debug("Find chromosomes ")
3876                        sql_query_chromosomes = f"""SELECT table_variants.\"#CHROM\" as CHROM FROM {table_variants} as table_variants GROUP BY table_variants.\"#CHROM\""""
3877                        sql_query_chromosomes_df = self.get_query_to_df(
3878                            sql_query_chromosomes
3879                        )
3880                        chomosomes_list = list(sql_query_chromosomes_df["CHROM"])
3881
3882                        log.debug("Chromosomes found: " + str(list(chomosomes_list)))
3883
3884                        # BED columns in the annotation file
3885                        if db_file_type in ["bed"]:
3886                            annotation_infos = "CHROM,POS,POS," + annotation_infos
3887
3888                        for chrom in chomosomes_list:
3889
3890                            # Create BED on initial VCF
3891                            log.debug("Create BED on initial VCF: " + str(tmp_vcf_name))
3892                            tmp_bed = NamedTemporaryFile(
3893                                prefix=self.get_prefix(),
3894                                dir=self.get_tmp_dir(),
3895                                suffix=".bed",
3896                                delete=False,
3897                            )
3898                            tmp_bed_name = tmp_bed.name
3899                            tmp_files.append(tmp_bed_name)
3900
3901                            # Detecte regions
3902                            log.debug(
3903                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Start detecting regions..."
3904                            )
3905                            window = 1000000
3906                            sql_query_intervals_for_bed = f"""
3907                                SELECT  \"#CHROM\",
3908                                        CASE WHEN \"POS\"-{window}-1 < 0 THEN 0 ELSE \"POS\"-{window}-1 END,
3909                                        \"POS\"+{window}
3910                                FROM {table_variants} as table_variants
3911                                WHERE table_variants.\"#CHROM\" = '{chrom}'
3912                            """
3913                            regions = self.conn.execute(
3914                                sql_query_intervals_for_bed
3915                            ).fetchall()
3916                            merged_regions = merge_regions(regions)
3917                            log.debug(
3918                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Stop detecting regions..."
3919                            )
3920
3921                            header = ["#CHROM", "START", "END"]
3922                            with open(tmp_bed_name, "w") as f:
3923                                # Write the header with tab delimiter
3924                                f.write("\t".join(header) + "\n")
3925                                for d in merged_regions:
3926                                    # Write each data row with tab delimiter
3927                                    f.write("\t".join(map(str, d)) + "\n")
3928
3929                            # Tmp files
3930                            tmp_annotation_vcf = NamedTemporaryFile(
3931                                prefix=self.get_prefix(),
3932                                dir=self.get_tmp_dir(),
3933                                suffix=".vcf.gz",
3934                                delete=False,
3935                            )
3936                            tmp_annotation_vcf_name = tmp_annotation_vcf.name
3937                            tmp_files.append(tmp_annotation_vcf_name)
3938                            tmp_ann_vcf_list.append(f"{tmp_annotation_vcf_name}")
3939                            tmp_annotation_vcf_name_err = (
3940                                tmp_annotation_vcf_name + ".err"
3941                            )
3942                            err_files.append(tmp_annotation_vcf_name_err)
3943
3944                            # Annotate Command
3945                            log.debug(
3946                                f"Annotation '{annotation}' - add bcftools command"
3947                            )
3948
3949                            # Command
3950                            command_annotate = f"{bcftools_bin_command} annotate --pair-logic exact --regions-file={tmp_bed_name} -a {db_file} -h {tmp_header_vcf_name} -c {annotation_infos} {tmp_vcf_name} -o {tmp_annotation_vcf_name} -Oz1 2>>{tmp_annotation_vcf_name_err} && tabix {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} "
3951
3952                            # Add command
3953                            commands.append(command_annotate)
3954
3955            # if some commands
3956            if commands:
3957
3958                # Export VCF file
3959                self.export_variant_vcf(
3960                    vcf_file=tmp_vcf_name,
3961                    remove_info=True,
3962                    add_samples=False,
3963                    index=True,
3964                )
3965
3966                # Threads
3967                # calculate threads for annotated commands
3968                if commands:
3969                    threads_bcftools_annotate = round(threads / len(commands))
3970                else:
3971                    threads_bcftools_annotate = 1
3972
3973                if not threads_bcftools_annotate:
3974                    threads_bcftools_annotate = 1
3975
3976                # Add threads option to bcftools commands
3977                if threads_bcftools_annotate > 1:
3978                    commands_threaded = []
3979                    for command in commands:
3980                        commands_threaded.append(
3981                            command.replace(
3982                                f"{bcftools_bin_command} annotate ",
3983                                f"{bcftools_bin_command} annotate --threads={threads_bcftools_annotate} ",
3984                            )
3985                        )
3986                    commands = commands_threaded
3987
3988                # Command annotation multithreading
3989                log.debug(f"Annotation - Annotation commands: " + str(commands))
3990                log.info(
3991                    f"Annotation - Annotation multithreaded in "
3992                    + str(len(commands))
3993                    + " commands"
3994                )
3995
3996                run_parallel_commands(commands, threads)
3997
3998                # Merge
3999                tmp_ann_vcf_list_cmd = " ".join(tmp_ann_vcf_list)
4000
4001                if tmp_ann_vcf_list_cmd:
4002
4003                    # Tmp file
4004                    tmp_annotate_vcf = NamedTemporaryFile(
4005                        prefix=self.get_prefix(),
4006                        dir=self.get_tmp_dir(),
4007                        suffix=".vcf.gz",
4008                        delete=True,
4009                    )
4010                    tmp_annotate_vcf_name = tmp_annotate_vcf.name
4011                    tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
4012                    err_files.append(tmp_annotate_vcf_name_err)
4013
4014                    # Tmp file remove command
4015                    tmp_files_remove_command = ""
4016                    if tmp_files:
4017                        tmp_files_remove_command = " && rm -f " + " ".join(tmp_files)
4018
4019                    # Command merge
4020                    merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_ann_vcf_list_cmd} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} {tmp_files_remove_command}"
4021                    log.info(
4022                        f"Annotation - Annotation merging "
4023                        + str(len(commands))
4024                        + " annotated files"
4025                    )
4026                    log.debug(f"Annotation - merge command: {merge_command}")
4027                    run_parallel_commands([merge_command], 1)
4028
4029                    # Error messages
4030                    log.info(f"Error/Warning messages:")
4031                    error_message_command_all = []
4032                    error_message_command_warning = []
4033                    error_message_command_err = []
4034                    for err_file in err_files:
4035                        with open(err_file, "r") as f:
4036                            for line in f:
4037                                message = line.strip()
4038                                error_message_command_all.append(message)
4039                                if line.startswith("[W::"):
4040                                    error_message_command_warning.append(message)
4041                                if line.startswith("[E::"):
4042                                    error_message_command_err.append(
4043                                        f"{err_file}: " + message
4044                                    )
4045                    # log info
4046                    for message in list(
4047                        set(error_message_command_err + error_message_command_warning)
4048                    ):
4049                        log.info(f"   {message}")
4050                    # debug info
4051                    for message in list(set(error_message_command_all)):
4052                        log.debug(f"   {message}")
4053                    # failed
4054                    if len(error_message_command_err):
4055                        log.error("Annotation failed: Error in commands")
4056                        raise ValueError("Annotation failed: Error in commands")
4057
4058                    # Update variants
4059                    log.info(f"Annotation - Updating...")
4060                    self.update_from_vcf(tmp_annotate_vcf_name)

This function annotates variants with bcftools.

Parameters
  • threads: number of threads to use

Returns
  The value of the variable "return_value".

def annotation_exomiser(self, threads: int = None) -> None:
4062    def annotation_exomiser(self, threads: int = None) -> None:
4063        """
4064        This function annotate with Exomiser
4065
4066        This function uses args as parameters, in section "annotation" -> "exomiser", with sections:
4067        - "analysis" (dict/file):
4068            Full analysis dictionnary parameters (see Exomiser docs).
4069            Either a dict, or a file in JSON or YAML format.
4070            These parameters may change depending on other parameters (e.g. phenotipicFeatures/HPO)
4071            Default : None
4072        - "preset" (string):
4073            Analysis preset (available in config folder).
4074            Used if no full "analysis" is provided.
4075            Default: "exome"
4076        - "phenopacket" (dict/file):
4077            Samples and phenotipic features parameters (see Exomiser docs).
4078            Either a dict, or a file in JSON or YAML format.
4079            Default: None
4080        - "subject" (dict):
4081            Sample parameters (see Exomiser docs).
4082            Example:
4083                "subject":
4084                    {
4085                        "id": "ISDBM322017",
4086                        "sex": "FEMALE"
4087                    }
4088            Default: None
4089        - "sample" (string):
4090            Sample name to construct "subject" section:
4091                "subject":
4092                    {
4093                        "id": "<sample>",
4094                        "sex": "UNKNOWN_SEX"
4095                    }
4096            Default: None
4097        - "phenotypicFeatures" (dict)
4098            Phenotypic features to construct "subject" section.
4099            Example:
4100                "phenotypicFeatures":
4101                    [
4102                        { "type": { "id": "HP:0001159", "label": "Syndactyly" } },
4103                        { "type": { "id": "HP:0000486", "label": "Strabismus" } }
4104                    ]
4105        - "hpo" (list)
4106            List of HPO ids as phenotypic features.
4107            Example:
4108                "hpo": ['0001156', '0001363', '0011304', '0010055']
4109            Default: []
4110        - "outputOptions" (dict):
4111            Output options (see Exomiser docs).
4112            Default:
4113                "output_options" =
4114                    {
4115                        "outputContributingVariantsOnly": False,
4116                        "numGenes": 0,
4117                        "outputFormats": ["TSV_VARIANT", "VCF"]
4118                    }
4119        - "transcript_source" (string):
4120            Transcript source (either "refseq", "ucsc", "ensembl")
4121            Default: "refseq"
4122        - "exomiser_to_info" (boolean):
4123            Add exomiser TSV file columns as INFO fields in VCF.
4124            Default: False
4125        - "release" (string):
4126            Exomise database release.
4127            If not exists, database release will be downloaded (take a while).
4128            Default: None (provided by application.properties configuration file)
4129        - "exomiser_application_properties" (file):
4130            Exomiser configuration file (see Exomiser docs).
4131            Useful to automatically download databases (especially for specific genome databases).
4132
4133        Notes:
4134        - If no sample in parameters, first sample in VCF will be chosen
4135        - If no HPO found, "hiPhivePrioritiser" analysis step will be switch off
4136
4137        :param threads: The number of threads to use
4138        :return: None.
4139        """
4140
4141        # DEBUG
4142        log.debug("Start annotation with Exomiser databases")
4143
4144        # Threads
4145        if not threads:
4146            threads = self.get_threads()
4147        log.debug("Threads: " + str(threads))
4148
4149        # Config
4150        config = self.get_config()
4151        log.debug("Config: " + str(config))
4152
4153        # Config - Folders - Databases
4154        databases_folders = (
4155            config.get("folders", {})
4156            .get("databases", {})
4157            .get("exomiser", f"{DEFAULT_DATABASE_FOLDER}/exomiser/current")
4158        )
4159        databases_folders = full_path(databases_folders)
4160        if not os.path.exists(databases_folders):
4161            log.error(f"Databases annotations: {databases_folders} NOT found")
4162        log.debug("Databases annotations: " + str(databases_folders))
4163
4164        # Config - Exomiser
4165        exomiser_bin_command = get_bin_command(
4166            bin="exomiser-cli*.jar",
4167            tool="exomiser",
4168            bin_type="jar",
4169            config=config,
4170            default_folder=f"{DEFAULT_TOOLS_FOLDER}/exomiser",
4171        )
4172        log.debug("Exomiser bin command: " + str(exomiser_bin_command))
4173        if not exomiser_bin_command:
4174            msg_err = f"Annotation failed: no exomiser bin '{exomiser_bin_command}'"
4175            log.error(msg_err)
4176            raise ValueError(msg_err)
4177
4178        # Param
4179        param = self.get_param()
4180        log.debug("Param: " + str(param))
4181
4182        # Param - Exomiser
4183        param_exomiser = param.get("annotation", {}).get("exomiser", {})
4184        log.debug(f"Param Exomiser: {param_exomiser}")
4185
4186        # Param - Assembly
4187        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
4188        log.debug("Assembly: " + str(assembly))
4189
4190        # Data
4191        table_variants = self.get_table_variants()
4192
4193        # Check if not empty
4194        log.debug("Check if not empty")
4195        sql_query_chromosomes = (
4196            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
4197        )
4198        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
4199            log.info(f"VCF empty")
4200            return False
4201
4202        # VCF header
4203        vcf_reader = self.get_header()
4204        log.debug("Initial header: " + str(vcf_reader.infos))
4205
4206        # Samples
4207        samples = self.get_header_sample_list()
4208        if not samples:
4209            log.error("No Samples in VCF")
4210            return False
4211        log.debug(f"Samples: {samples}")
4212
4213        # Memory limit
4214        memory_limit = self.get_memory("8G")
4215        log.debug(f"memory_limit: {memory_limit}")
4216
4217        # Exomiser java options
4218        exomiser_java_options = (
4219            f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} "
4220        )
4221        log.debug(f"Exomiser java options: {exomiser_java_options}")
4222
4223        # Download Exomiser (if not exists)
4224        exomiser_release = param_exomiser.get("release", None)
4225        exomiser_application_properties = param_exomiser.get(
4226            "exomiser_application_properties", None
4227        )
4228        databases_download_exomiser(
4229            assemblies=[assembly],
4230            exomiser_folder=databases_folders,
4231            exomiser_release=exomiser_release,
4232            exomiser_phenotype_release=exomiser_release,
4233            exomiser_application_properties=exomiser_application_properties,
4234        )
4235
4236        # Force annotation
4237        force_update_annotation = True
4238
4239        if "Exomiser" not in self.get_header().infos or force_update_annotation:
4240            log.debug("Start annotation Exomiser")
4241
4242            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:
4243
4244                # tmp_dir = "/tmp/exomiser"
4245
4246                ### ANALYSIS ###
4247                ################
4248
4249                # Create analysis.json through analysis dict
4250                # either analysis in param or by default
4251                # depending on preset exome/genome)
4252
4253                # Init analysis dict
4254                param_exomiser_analysis_dict = {}
4255
4256                # analysis from param
4257                param_exomiser_analysis = param_exomiser.get("analysis", {})
4258                param_exomiser_analysis = full_path(param_exomiser_analysis)
4259
4260                # If analysis in param -> load anlaysis json
4261                if param_exomiser_analysis:
4262
4263                    # If param analysis is a file and exists
4264                    if isinstance(param_exomiser_analysis, str) and os.path.exists(
4265                        param_exomiser_analysis
4266                    ):
4267                        # Load analysis file into analysis dict (either yaml or json)
4268                        with open(param_exomiser_analysis) as json_file:
4269                            param_exomiser_analysis_dict = yaml.safe_load(json_file)
4270
4271                    # If param analysis is a dict
4272                    elif isinstance(param_exomiser_analysis, dict):
4273                        # Load analysis dict into analysis dict (either yaml or json)
4274                        param_exomiser_analysis_dict = param_exomiser_analysis
4275
4276                    # Error analysis type
4277                    else:
4278                        log.error(f"Analysis type unknown. Check param file.")
4279                        raise ValueError(f"Analysis type unknown. Check param file.")
4280
4281                # Case no input analysis config file/dict
4282                # Use preset (exome/genome) to open default config file
4283                if not param_exomiser_analysis_dict:
4284
4285                    # default preset
4286                    default_preset = "exome"
4287
4288                    # Get param preset or default preset
4289                    param_exomiser_preset = param_exomiser.get("preset", default_preset)
4290
4291                    # Try to find if preset is a file
4292                    if os.path.exists(param_exomiser_preset):
4293                        # Preset file is provided in full path
4294                        param_exomiser_analysis_default_config_file = (
4295                            param_exomiser_preset
4296                        )
4297                    # elif os.path.exists(full_path(param_exomiser_preset)):
4298                    #     # Preset file is provided in full path
4299                    #     param_exomiser_analysis_default_config_file = full_path(param_exomiser_preset)
4300                    elif os.path.exists(
4301                        os.path.join(folder_config, param_exomiser_preset)
4302                    ):
4303                        # Preset file is provided a basename in config folder (can be a path with subfolders)
4304                        param_exomiser_analysis_default_config_file = os.path.join(
4305                            folder_config, param_exomiser_preset
4306                        )
4307                    else:
4308                        # Construct preset file
4309                        param_exomiser_analysis_default_config_file = os.path.join(
4310                            folder_config,
4311                            f"preset-{param_exomiser_preset}-analysis.json",
4312                        )
4313
4314                    # If preset file exists
4315                    param_exomiser_analysis_default_config_file = full_path(
4316                        param_exomiser_analysis_default_config_file
4317                    )
4318                    if os.path.exists(param_exomiser_analysis_default_config_file):
4319                        # Load prest file into analysis dict (either yaml or json)
4320                        with open(
4321                            param_exomiser_analysis_default_config_file
4322                        ) as json_file:
4323                            # param_exomiser_analysis_dict[""] = json.load(json_file)
4324                            param_exomiser_analysis_dict["analysis"] = yaml.safe_load(
4325                                json_file
4326                            )
4327
4328                    # Error preset file
4329                    else:
4330                        log.error(
4331                            f"No analysis preset config file ({param_exomiser_analysis_default_config_file})"
4332                        )
4333                        raise ValueError(
4334                            f"No analysis preset config file ({param_exomiser_analysis_default_config_file})"
4335                        )
4336
4337                # If no analysis dict created
4338                if not param_exomiser_analysis_dict:
4339                    log.error(f"No analysis config")
4340                    raise ValueError(f"No analysis config")
4341
4342                # Log
4343                log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}")
4344
4345                ### PHENOPACKET ###
4346                ###################
4347
4348                # If no PhenoPacket in analysis dict -> check in param
4349                if "phenopacket" not in param_exomiser_analysis_dict:
4350
4351                    # If PhenoPacket in param -> load anlaysis json
4352                    if param_exomiser.get("phenopacket", None):
4353
4354                        param_exomiser_phenopacket = param_exomiser.get("phenopacket")
4355                        param_exomiser_phenopacket = full_path(
4356                            param_exomiser_phenopacket
4357                        )
4358
4359                        # If param phenopacket is a file and exists
4360                        if isinstance(
4361                            param_exomiser_phenopacket, str
4362                        ) and os.path.exists(param_exomiser_phenopacket):
4363                            # Load phenopacket file into analysis dict (either yaml or json)
4364                            with open(param_exomiser_phenopacket) as json_file:
4365                                param_exomiser_analysis_dict["phenopacket"] = (
4366                                    yaml.safe_load(json_file)
4367                                )
4368
4369                        # If param phenopacket is a dict
4370                        elif isinstance(param_exomiser_phenopacket, dict):
4371                            # Load phenopacket dict into analysis dict (either yaml or json)
4372                            param_exomiser_analysis_dict["phenopacket"] = (
4373                                param_exomiser_phenopacket
4374                            )
4375
4376                        # Error phenopacket type
4377                        else:
4378                            log.error(f"Phenopacket type unknown. Check param file.")
4379                            raise ValueError(
4380                                f"Phenopacket type unknown. Check param file."
4381                            )
4382
4383                # If no PhenoPacket in analysis dict -> construct from sample and HPO in param
4384                if "phenopacket" not in param_exomiser_analysis_dict:
4385
4386                    # Init PhenoPacket
4387                    param_exomiser_analysis_dict["phenopacket"] = {
4388                        "id": "analysis",
4389                        "proband": {},
4390                    }
4391
4392                    ### Add subject ###
4393
4394                    # If subject exists
4395                    param_exomiser_subject = param_exomiser.get("subject", {})
4396
4397                    # If subject not exists -> found sample ID
4398                    if not param_exomiser_subject:
4399
4400                        # Found sample ID in param
4401                        sample = param_exomiser.get("sample", None)
4402
4403                        # Find sample ID (first sample)
4404                        if not sample:
4405                            sample_list = self.get_header_sample_list()
4406                            if len(sample_list) > 0:
4407                                sample = sample_list[0]
4408                            else:
4409                                log.error(f"No sample found")
4410                                raise ValueError(f"No sample found")
4411
4412                        # Create subject
4413                        param_exomiser_subject = {"id": sample, "sex": "UNKNOWN_SEX"}
4414
4415                    # Add to dict
4416                    param_exomiser_analysis_dict["phenopacket"][
4417                        "subject"
4418                    ] = param_exomiser_subject
4419
4420                    ### Add "phenotypicFeatures" ###
4421
4422                    # If phenotypicFeatures exists
4423                    param_exomiser_phenotypicfeatures = param_exomiser.get(
4424                        "phenotypicFeatures", []
4425                    )
4426
4427                    # If phenotypicFeatures not exists -> Try to infer from hpo list
4428                    if not param_exomiser_phenotypicfeatures:
4429
4430                        # Found HPO in param
4431                        param_exomiser_hpo = param_exomiser.get("hpo", [])
4432
4433                        # Split HPO if list in string format separated by comma
4434                        if isinstance(param_exomiser_hpo, str):
4435                            param_exomiser_hpo = param_exomiser_hpo.split(",")
4436
4437                        # Create HPO list
4438                        for hpo in param_exomiser_hpo:
4439                            hpo_clean = re.sub("[^0-9]", "", hpo)
4440                            param_exomiser_phenotypicfeatures.append(
4441                                {
4442                                    "type": {
4443                                        "id": f"HP:{hpo_clean}",
4444                                        "label": f"HP:{hpo_clean}",
4445                                    }
4446                                }
4447                            )
4448
4449                    # Add to dict
4450                    param_exomiser_analysis_dict["phenopacket"][
4451                        "phenotypicFeatures"
4452                    ] = param_exomiser_phenotypicfeatures
4453
4454                    # If phenotypicFeatures not exists -> Remove hiPhivePrioritiser step
4455                    if not param_exomiser_phenotypicfeatures:
4456                        for step in param_exomiser_analysis_dict.get(
4457                            "analysis", {}
4458                        ).get("steps", []):
4459                            if "hiPhivePrioritiser" in step:
4460                                param_exomiser_analysis_dict.get("analysis", {}).get(
4461                                    "steps", []
4462                                ).remove(step)
4463
4464                ### Add Input File ###
4465
4466                # Initial file name and htsFiles
4467                tmp_vcf_name = os.path.join(tmp_dir, "initial.vcf.gz")
4468                param_exomiser_analysis_dict["phenopacket"]["htsFiles"] = [
4469                    {
4470                        "uri": tmp_vcf_name,
4471                        "htsFormat": "VCF",
4472                        "genomeAssembly": assembly,
4473                    }
4474                ]
4475
4476                ### Add metaData ###
4477
4478                # If metaData not in analysis dict
4479                if "metaData" not in param_exomiser_analysis_dict:
4480                    param_exomiser_analysis_dict["phenopacket"]["metaData"] = {
4481                        "created": f"{datetime.datetime.now()}".replace(" ", "T") + "Z",
4482                        "createdBy": "howard",
4483                        "phenopacketSchemaVersion": 1,
4484                    }
4485
4486                ### OutputOptions ###
4487
4488                # Init output result folder
4489                output_results = os.path.join(tmp_dir, "results")
4490
4491                # If no outputOptions in analysis dict
4492                if "outputOptions" not in param_exomiser_analysis_dict:
4493
4494                    # default output formats
4495                    defaut_output_formats = ["TSV_VARIANT", "VCF"]
4496
4497                    # Get outputOptions in param
4498                    output_options = param_exomiser.get("outputOptions", None)
4499
4500                    # If no output_options in param -> check
4501                    if not output_options:
4502                        output_options = {
4503                            "outputContributingVariantsOnly": False,
4504                            "numGenes": 0,
4505                            "outputFormats": defaut_output_formats,
4506                        }
4507
4508                    # Replace outputDirectory in output options
4509                    output_options["outputDirectory"] = output_results
4510                    output_options["outputFileName"] = "howard"
4511
4512                    # Add outputOptions in analysis dict
4513                    param_exomiser_analysis_dict["outputOptions"] = output_options
4514
4515                else:
4516
4517                    # Replace output_results and output format (if exists in param)
4518                    param_exomiser_analysis_dict["outputOptions"][
4519                        "outputDirectory"
4520                    ] = output_results
4521                    param_exomiser_analysis_dict["outputOptions"]["outputFormats"] = (
4522                        list(
4523                            set(
4524                                param_exomiser_analysis_dict.get(
4525                                    "outputOptions", {}
4526                                ).get("outputFormats", [])
4527                                + ["TSV_VARIANT", "VCF"]
4528                            )
4529                        )
4530                    )
4531
4532                # log
4533                log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}")
4534
4535                ### ANALYSIS FILE ###
4536                #####################
4537
4538                ### Full JSON analysis config file ###
4539
4540                exomiser_analysis = os.path.join(tmp_dir, "analysis.json")
4541                with open(exomiser_analysis, "w") as fp:
4542                    json.dump(param_exomiser_analysis_dict, fp, indent=4)
4543
4544                ### SPLIT analysis and sample config files
4545
4546                # Splitted analysis dict
4547                param_exomiser_analysis_dict_for_split = (
4548                    param_exomiser_analysis_dict.copy()
4549                )
4550
4551                # Phenopacket JSON file
4552                exomiser_analysis_phenopacket = os.path.join(
4553                    tmp_dir, "analysis_phenopacket.json"
4554                )
4555                with open(exomiser_analysis_phenopacket, "w") as fp:
4556                    json.dump(
4557                        param_exomiser_analysis_dict_for_split.get("phenopacket"),
4558                        fp,
4559                        indent=4,
4560                    )
4561
4562                # Analysis JSON file without Phenopacket parameters
4563                param_exomiser_analysis_dict_for_split.pop("phenopacket")
4564                exomiser_analysis_analysis = os.path.join(
4565                    tmp_dir, "analysis_analysis.json"
4566                )
4567                with open(exomiser_analysis_analysis, "w") as fp:
4568                    json.dump(param_exomiser_analysis_dict_for_split, fp, indent=4)
4569
4570                ### INITAL VCF file ###
4571                #######################
4572
4573                ### Create list of samples to use and include inti initial VCF file ####
4574
4575                # Subject (main sample)
4576                # Get sample ID in analysis dict
4577                sample_subject = (
4578                    param_exomiser_analysis_dict.get("phenopacket", {})
4579                    .get("subject", {})
4580                    .get("id", None)
4581                )
4582                sample_proband = (
4583                    param_exomiser_analysis_dict.get("phenopacket", {})
4584                    .get("proband", {})
4585                    .get("subject", {})
4586                    .get("id", None)
4587                )
4588                sample = []
4589                if sample_subject:
4590                    sample.append(sample_subject)
4591                if sample_proband:
4592                    sample.append(sample_proband)
4593
4594                # Get sample ID within Pedigree
4595                pedigree_persons_list = (
4596                    param_exomiser_analysis_dict.get("phenopacket", {})
4597                    .get("pedigree", {})
4598                    .get("persons", {})
4599                )
4600
4601                # Create list with all sample ID in pedigree (if exists)
4602                pedigree_persons = []
4603                for person in pedigree_persons_list:
4604                    pedigree_persons.append(person.get("individualId"))
4605
4606                # Concat subject sample ID and samples ID in pedigreesamples
4607                samples = list(set(sample + pedigree_persons))
4608
4609                # Check if sample list is not empty
4610                if not samples:
4611                    log.error(f"No samples found")
4612                    raise ValueError(f"No samples found")
4613
4614                # Create VCF with sample (either sample in param or first one by default)
4615                # Export VCF file
4616                self.export_variant_vcf(
4617                    vcf_file=tmp_vcf_name,
4618                    remove_info=True,
4619                    add_samples=True,
4620                    list_samples=samples,
4621                    index=False,
4622                )
4623
4624                ### Execute Exomiser ###
4625                ########################
4626
4627                # Init command
4628                exomiser_command = ""
4629
4630                # Command exomiser options
4631                exomiser_options = f" --spring.config.location={databases_folders}/{assembly}/application.properties --exomiser.data-directory={databases_folders}/{assembly} "
4632
4633                # Release
4634                exomiser_release = param_exomiser.get("release", None)
4635                if exomiser_release:
4636                    # phenotype data version
4637                    exomiser_options += (
4638                        f" --exomiser.phenotype.data-version={exomiser_release} "
4639                    )
4640                    # data version
4641                    exomiser_options += (
4642                        f" --exomiser.{assembly}.data-version={exomiser_release} "
4643                    )
4644                    # variant white list
4645                    variant_white_list_file = (
4646                        f"{exomiser_release}_{assembly}_clinvar_whitelist.tsv.gz"
4647                    )
4648                    if os.path.exists(
4649                        os.path.join(
4650                            databases_folders, assembly, variant_white_list_file
4651                        )
4652                    ):
4653                        exomiser_options += f" --exomiser.{assembly}.variant-white-list-path={variant_white_list_file} "
4654
4655                # transcript_source
4656                transcript_source = param_exomiser.get(
4657                    "transcript_source", None
4658                )  # ucsc, refseq, ensembl
4659                if transcript_source:
4660                    exomiser_options += (
4661                        f" --exomiser.{assembly}.transcript-source={transcript_source} "
4662                    )
4663
4664                # If analysis contain proband param
4665                if param_exomiser_analysis_dict.get("phenopacket", {}).get(
4666                    "proband", {}
4667                ):
4668                    exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis_analysis} --sample={exomiser_analysis_phenopacket} {exomiser_options} "
4669
4670                # If no proband (usually uniq sample)
4671                else:
4672                    exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis} {exomiser_options}"
4673
4674                # Log
4675                log.debug(f"exomiser_command_analysis={exomiser_command_analysis}")
4676
4677                # Run command
4678                result = subprocess.call(
4679                    exomiser_command_analysis.split(), stdout=subprocess.PIPE
4680                )
4681                if result:
4682                    log.error("Exomiser command failed")
4683                    raise ValueError("Exomiser command failed")
4684
4685                ### RESULTS ###
4686                ###############
4687
4688                ### Annotate with TSV fields ###
4689
4690                # Init result tsv file
4691                exomiser_to_info = param_exomiser.get("exomiser_to_info", False)
4692
4693                # Init result tsv file
4694                output_results_tsv = os.path.join(output_results, "howard.variants.tsv")
4695
4696                # Parse TSV file and explode columns in INFO field
4697                if exomiser_to_info and os.path.exists(output_results_tsv):
4698
4699                    # Log
4700                    log.debug("Exomiser columns to VCF INFO field")
4701
4702                    # Retrieve columns and types
4703                    query = f""" SELECT * FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) LIMIT 0 """
4704                    output_results_tsv_df = self.get_query_to_df(query)
4705                    output_results_tsv_columns = output_results_tsv_df.columns.tolist()
4706
4707                    # Init concat fields for update
4708                    sql_query_update_concat_fields = []
4709
4710                    # Fields to avoid
4711                    fields_to_avoid = [
4712                        "CONTIG",
4713                        "START",
4714                        "END",
4715                        "REF",
4716                        "ALT",
4717                        "QUAL",
4718                        "FILTER",
4719                        "GENOTYPE",
4720                    ]
4721
4722                    # List all columns to add into header
4723                    for header_column in output_results_tsv_columns:
4724
4725                        # If header column is enable
4726                        if header_column not in fields_to_avoid:
4727
4728                            # Header info type
4729                            header_info_type = "String"
4730                            header_column_df = output_results_tsv_df[header_column]
4731                            header_column_df_dtype = header_column_df.dtype
4732                            if header_column_df_dtype == object:
4733                                if (
4734                                    pd.to_numeric(header_column_df, errors="coerce")
4735                                    .notnull()
4736                                    .all()
4737                                ):
4738                                    header_info_type = "Float"
4739                            else:
4740                                header_info_type = "Integer"
4741
4742                            # Header info
4743                            characters_to_validate = ["-"]
4744                            pattern = "[" + "".join(characters_to_validate) + "]"
4745                            header_info_name = re.sub(
4746                                pattern,
4747                                "_",
4748                                f"Exomiser_{header_column}".replace("#", ""),
4749                            )
4750                            header_info_number = "."
4751                            header_info_description = (
4752                                f"Exomiser {header_column} annotation"
4753                            )
4754                            header_info_source = "Exomiser"
4755                            header_info_version = "unknown"
4756                            header_info_code = CODE_TYPE_MAP[header_info_type]
4757                            vcf_reader.infos[header_info_name] = vcf.parser._Info(
4758                                header_info_name,
4759                                header_info_number,
4760                                header_info_type,
4761                                header_info_description,
4762                                header_info_source,
4763                                header_info_version,
4764                                header_info_code,
4765                            )
4766
4767                            # Add field to add for update to concat fields
4768                            sql_query_update_concat_fields.append(
4769                                f"""
4770                                CASE
4771                                    WHEN table_parquet."{header_column}" NOT IN ('','.')
4772                                    THEN concat(
4773                                        '{header_info_name}=',
4774                                        table_parquet."{header_column}",
4775                                        ';'
4776                                        )
4777
4778                                    ELSE ''
4779                                END
4780                            """
4781                            )
4782
4783                    # Update query
4784                    sql_query_update = f"""
4785                        UPDATE {table_variants} as table_variants
4786                            SET INFO = concat(
4787                                            CASE
4788                                                WHEN INFO NOT IN ('', '.')
4789                                                THEN INFO
4790                                                ELSE ''
4791                                            END,
4792                                            CASE
4793                                                WHEN table_variants.INFO NOT IN ('','.')
4794                                                THEN ';'
4795                                                ELSE ''
4796                                            END,
4797                                            (
4798                                            SELECT 
4799                                                concat(
4800                                                    {",".join(sql_query_update_concat_fields)}
4801                                                )
4802                                            FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) as table_parquet
4803                                                    WHERE concat('chr', CAST(table_parquet.\"CONTIG\" AS STRING)) = table_variants.\"#CHROM\"
4804                                                    AND table_parquet.\"START\" = table_variants.\"POS\"
4805                                                    AND table_parquet.\"ALT\" = table_variants.\"ALT\"
4806                                                    AND table_parquet.\"REF\" = table_variants.\"REF\"
4807                                            )
4808                                        )
4809                            ;
4810                        """
4811
4812                    # Update
4813                    self.conn.execute(sql_query_update)
4814
4815                ### Annotate with VCF INFO field ###
4816
4817                # Init result VCF file
4818                output_results_vcf = os.path.join(output_results, "howard.vcf.gz")
4819
4820                # If VCF exists
4821                if os.path.exists(output_results_vcf):
4822
4823                    # Log
4824                    log.debug("Exomiser result VCF update variants")
4825
4826                    # Find Exomiser INFO field annotation in header
4827                    with gzip.open(output_results_vcf, "rt") as f:
4828                        header_list = self.read_vcf_header(f)
4829                    exomiser_vcf_header = vcf.Reader(
4830                        io.StringIO("\n".join(header_list))
4831                    )
4832
4833                    # Add annotation INFO field to header
4834                    vcf_reader.infos["Exomiser"] = exomiser_vcf_header.infos["Exomiser"]
4835
4836                    # Update variants with VCF
4837                    self.update_from_vcf(output_results_vcf)
4838
4839        return True

This function annotate with Exomiser

This function uses args as parameters, in section "annotation" -> "exomiser", with sections:

  • "analysis" (dict/file): Full analysis dictionary parameters (see Exomiser docs). Either a dict, or a file in JSON or YAML format. These parameters may change depending on other parameters (e.g. phenotypicFeatures/HPO) Default: None
  • "preset" (string): Analysis preset (available in config folder). Used if no full "analysis" is provided. Default: "exome"
  • "phenopacket" (dict/file): Samples and phenotypic features parameters (see Exomiser docs). Either a dict, or a file in JSON or YAML format. Default: None
  • "subject" (dict): Sample parameters (see Exomiser docs). Example: "subject": { "id": "ISDBM322017", "sex": "FEMALE" } Default: None
  • "sample" (string): Sample name to construct "subject" section: "subject": { "id": "", "sex": "UNKNOWN_SEX" } Default: None
  • "phenotypicFeatures" (dict) Phenotypic features to construct "subject" section. Example: "phenotypicFeatures": [ { "type": { "id": "HP:0001159", "label": "Syndactyly" } }, { "type": { "id": "HP:0000486", "label": "Strabismus" } } ]
  • "hpo" (list) List of HPO ids as phenotypic features. Example: "hpo": ['0001156', '0001363', '0011304', '0010055'] Default: []
  • "outputOptions" (dict): Output options (see Exomiser docs). Default: "output_options" = { "outputContributingVariantsOnly": False, "numGenes": 0, "outputFormats": ["TSV_VARIANT", "VCF"] }
  • "transcript_source" (string): Transcript source (either "refseq", "ucsc", "ensembl") Default: "refseq"
  • "exomiser_to_info" (boolean): Add exomiser TSV file columns as INFO fields in VCF. Default: False
  • "release" (string): Exomiser database release. If it does not exist, the database release will be downloaded (takes a while). Default: None (provided by application.properties configuration file)
  • "exomiser_application_properties" (file): Exomiser configuration file (see Exomiser docs). Useful to automatically download databases (especially for specific genome databases).

Notes:

  • If no sample in parameters, first sample in VCF will be chosen
  • If no HPO found, "hiPhivePrioritiser" analysis step will be switched off
Parameters
  • threads: The number of threads to use
Returns

None.

def annotation_snpeff(self, threads: int = None) -> None:
4841    def annotation_snpeff(self, threads: int = None) -> None:
4842        """
4843        This function annotate with snpEff
4844
4845        :param threads: The number of threads to use
4846        :return: the value of the variable "return_value".
4847        """
4848
4849        # DEBUG
4850        log.debug("Start annotation with snpeff databases")
4851
4852        # Threads
4853        if not threads:
4854            threads = self.get_threads()
4855        log.debug("Threads: " + str(threads))
4856
4857        # DEBUG
4858        delete_tmp = True
4859        if self.get_config().get("verbosity", "warning") in ["debug"]:
4860            delete_tmp = False
4861            log.debug("Delete tmp files/folders: " + str(delete_tmp))
4862
4863        # Config
4864        config = self.get_config()
4865        log.debug("Config: " + str(config))
4866
4867        # Config - Folders - Databases
4868        databases_folders = (
4869            config.get("folders", {}).get("databases", {}).get("snpeff", ["."])
4870        )
4871        log.debug("Databases annotations: " + str(databases_folders))
4872
4873        # # Config - Java
4874        # java_bin = get_bin(
4875        #     tool="java",
4876        #     bin="java",
4877        #     bin_type="bin",
4878        #     config=config,
4879        #     default_folder="/usr/bin",
4880        # )
4881        # if not (os.path.exists(java_bin) or (java_bin and which(java_bin))):
4882        #     log.error(f"Annotation failed: no java bin '{java_bin}'")
4883        #     raise ValueError(f"Annotation failed: no java bin '{java_bin}'")
4884
4885        # # Config - snpEff bin
4886        # snpeff_jar = get_bin(
4887        #     tool="snpeff",
4888        #     bin="snpEff.jar",
4889        #     bin_type="jar",
4890        #     config=config,
4891        #     default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
4892        # )
4893        # if not (os.path.exists(snpeff_jar) or (snpeff_jar and which(snpeff_jar))):
4894        #     log.error(f"Annotation failed: no snpEff jar '{snpeff_jar}'")
4895        #     raise ValueError(f"Annotation failed: no snpEff jar '{snpeff_jar}'")
4896
4897        # Config - snpEff bin command
4898        snpeff_bin_command = get_bin_command(
4899            bin="snpEff.jar",
4900            tool="snpeff",
4901            bin_type="jar",
4902            config=config,
4903            default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
4904        )
4905        if not snpeff_bin_command:
4906            msg_err = f"Annotation failed: no snpeff bin '{snpeff_bin_command}'"
4907            log.error(msg_err)
4908            raise ValueError(msg_err)
4909
4910        # Config - snpEff databases
4911        snpeff_databases = (
4912            config.get("folders", {})
4913            .get("databases", {})
4914            .get("snpeff", DEFAULT_SNPEFF_FOLDER)
4915        )
4916        snpeff_databases = full_path(snpeff_databases)
4917        if snpeff_databases is not None and snpeff_databases != "":
4918            log.debug(f"Create snpEff databases folder")
4919            if not os.path.exists(snpeff_databases):
4920                os.makedirs(snpeff_databases)
4921
4922        # Param
4923        param = self.get_param()
4924        log.debug("Param: " + str(param))
4925
4926        # Param
4927        options = param.get("annotation", {}).get("snpeff", {}).get("options", None)
4928        log.debug("Options: " + str(options))
4929
4930        # Param - Assembly
4931        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
4932
4933        # Param - Options
4934        snpeff_options = (
4935            param.get("annotation", {}).get("snpeff", {}).get("options", "")
4936        )
4937        snpeff_stats = param.get("annotation", {}).get("snpeff", {}).get("stats", None)
4938        snpeff_csvstats = (
4939            param.get("annotation", {}).get("snpeff", {}).get("csvStats", None)
4940        )
4941        if snpeff_stats:
4942            snpeff_stats = snpeff_stats.replace("OUTPUT", self.get_output())
4943            snpeff_stats = full_path(snpeff_stats)
4944            snpeff_options += f" -stats {snpeff_stats}"
4945        if snpeff_csvstats:
4946            snpeff_csvstats = snpeff_csvstats.replace("OUTPUT", self.get_output())
4947            snpeff_csvstats = full_path(snpeff_csvstats)
4948            snpeff_options += f" -csvStats {snpeff_csvstats}"
4949
4950        # Data
4951        table_variants = self.get_table_variants()
4952
4953        # Check if not empty
4954        log.debug("Check if not empty")
4955        sql_query_chromosomes = (
4956            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
4957        )
4958        # if not self.conn.execute(f"{sql_query_chromosomes}").df()["count"][0]:
4959        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
4960            log.info(f"VCF empty")
4961            return
4962
4963        # Export in VCF
4964        log.debug("Create initial file to annotate")
4965        tmp_vcf = NamedTemporaryFile(
4966            prefix=self.get_prefix(),
4967            dir=self.get_tmp_dir(),
4968            suffix=".vcf.gz",
4969            delete=True,
4970        )
4971        tmp_vcf_name = tmp_vcf.name
4972
4973        # VCF header
4974        vcf_reader = self.get_header()
4975        log.debug("Initial header: " + str(vcf_reader.infos))
4976
4977        # Existing annotations
4978        for vcf_annotation in self.get_header().infos:
4979
4980            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
4981            log.debug(
4982                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
4983            )
4984
4985        # Memory limit
4986        # if config.get("memory", None):
4987        #     memory_limit = config.get("memory", "8G")
4988        # else:
4989        #     memory_limit = "8G"
4990        memory_limit = self.get_memory("8G")
4991        log.debug(f"memory_limit: {memory_limit}")
4992
4993        # snpEff java options
4994        snpeff_java_options = (
4995            f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} "
4996        )
4997        log.debug(f"Exomiser java options: {snpeff_java_options}")
4998
4999        force_update_annotation = True
5000
5001        if "ANN" not in self.get_header().infos or force_update_annotation:
5002
5003            # Check snpEff database
5004            log.debug(f"Check snpEff databases {[assembly]}")
5005            databases_download_snpeff(
5006                folder=snpeff_databases, assemblies=[assembly], config=config
5007            )
5008
5009            # Export VCF file
5010            self.export_variant_vcf(
5011                vcf_file=tmp_vcf_name,
5012                remove_info=True,
5013                add_samples=False,
5014                index=True,
5015            )
5016
5017            # Tmp file
5018            err_files = []
5019            tmp_annotate_vcf = NamedTemporaryFile(
5020                prefix=self.get_prefix(),
5021                dir=self.get_tmp_dir(),
5022                suffix=".vcf",
5023                delete=False,
5024            )
5025            tmp_annotate_vcf_name = tmp_annotate_vcf.name
5026            tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
5027            err_files.append(tmp_annotate_vcf_name_err)
5028
5029            # Command
5030            snpeff_command = f"{snpeff_bin_command} {assembly} -dataDir {snpeff_databases} {snpeff_options} {tmp_vcf_name} 1>{tmp_annotate_vcf_name} 2>>{tmp_annotate_vcf_name_err}"
5031            log.debug(f"Annotation - snpEff command: {snpeff_command}")
5032            run_parallel_commands([snpeff_command], 1)
5033
5034            # Error messages
5035            log.info(f"Error/Warning messages:")
5036            error_message_command_all = []
5037            error_message_command_warning = []
5038            error_message_command_err = []
5039            for err_file in err_files:
5040                with open(err_file, "r") as f:
5041                    for line in f:
5042                        message = line.strip()
5043                        error_message_command_all.append(message)
5044                        if line.startswith("[W::"):
5045                            error_message_command_warning.append(message)
5046                        if line.startswith("[E::"):
5047                            error_message_command_err.append(f"{err_file}: " + message)
5048            # log info
5049            for message in list(
5050                set(error_message_command_err + error_message_command_warning)
5051            ):
5052                log.info(f"   {message}")
5053            # debug info
5054            for message in list(set(error_message_command_all)):
5055                log.debug(f"   {message}")
5056            # failed
5057            if len(error_message_command_err):
5058                log.error("Annotation failed: Error in commands")
5059                raise ValueError("Annotation failed: Error in commands")
5060
5061            # Find annotation in header
5062            with open(tmp_annotate_vcf_name, "rt") as f:
5063                header_list = self.read_vcf_header(f)
5064            annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))
5065
5066            for ann in annovar_vcf_header.infos:
5067                if ann not in self.get_header().infos:
5068                    vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)
5069
5070            # Update variants
5071            log.info(f"Annotation - Updating...")
5072            self.update_from_vcf(tmp_annotate_vcf_name)
5073
5074        else:
5075            if "ANN" in self.get_header().infos:
5076                log.debug(f"Existing snpEff annotations in VCF")
5077            if force_update_annotation:
5078                log.debug(f"Existing snpEff annotations in VCF - annotation forced")

This function annotate with snpEff

Parameters
  • threads: The number of threads to use
Returns

the value of the variable "return_value".

def annotation_annovar(self, threads: int = None) -> None:
5080    def annotation_annovar(self, threads: int = None) -> None:
5081        """
5082        It takes a VCF file, annotates it with Annovar, and then updates the database with the new
5083        annotations
5084
5085        :param threads: number of threads to use
5086        :return: the value of the variable "return_value".
5087        """
5088
5089        # DEBUG
5090        log.debug("Start annotation with Annovar databases")
5091
5092        # Threads
5093        if not threads:
5094            threads = self.get_threads()
5095        log.debug("Threads: " + str(threads))
5096
5097        # Tmp en Err files
5098        tmp_files = []
5099        err_files = []
5100
5101        # DEBUG
5102        delete_tmp = True
5103        if self.get_config().get("verbosity", "warning") in ["debug"]:
5104            delete_tmp = False
5105            log.debug("Delete tmp files/folders: " + str(delete_tmp))
5106
5107        # Config
5108        config = self.get_config()
5109        log.debug("Config: " + str(config))
5110
5111        # Config - Folders - Databases
5112        databases_folders = (
5113            config.get("folders", {}).get("databases", {}).get("annovar", ["."])
5114        )
5115        log.debug("Databases annotations: " + str(databases_folders))
5116
5117        # Config - annovar bin command
5118        annovar_bin_command = get_bin_command(
5119            bin="table_annovar.pl",
5120            tool="annovar",
5121            bin_type="perl",
5122            config=config,
5123            default_folder=f"{DEFAULT_TOOLS_FOLDER}/annovar",
5124        )
5125        if not annovar_bin_command:
5126            msg_err = f"Annotation failed: no annovar bin '{annovar_bin_command}'"
5127            log.error(msg_err)
5128            raise ValueError(msg_err)
5129
5130        # Config - BCFTools bin command
5131        bcftools_bin_command = get_bin_command(
5132            bin="bcftools",
5133            tool="bcftools",
5134            bin_type="bin",
5135            config=config,
5136            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
5137        )
5138        if not bcftools_bin_command:
5139            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
5140            log.error(msg_err)
5141            raise ValueError(msg_err)
5142
5143        # Config - annovar databases
5144        annovar_databases = (
5145            config.get("folders", {})
5146            .get("databases", {})
5147            .get("annovar", DEFAULT_ANNOVAR_FOLDER)
5148        )
5149        annovar_databases = full_path(annovar_databases)
5150        if annovar_databases != "" and not os.path.exists(annovar_databases):
5151            os.makedirs(annovar_databases)
5152
5153        # Param
5154        param = self.get_param()
5155        log.debug("Param: " + str(param))
5156
5157        # Param - options
5158        options = param.get("annotation", {}).get("annovar", {}).get("options", {})
5159        log.debug("Options: " + str(options))
5160
5161        # Param - annotations
5162        annotations = (
5163            param.get("annotation", {}).get("annovar", {}).get("annotations", {})
5164        )
5165        log.debug("Annotations: " + str(annotations))
5166
5167        # Param - Assembly
5168        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
5169
5170        # Annovar database assembly
5171        annovar_databases_assembly = f"{annovar_databases}/{assembly}"
5172        if annovar_databases_assembly != "" and not os.path.exists(
5173            annovar_databases_assembly
5174        ):
5175            os.makedirs(annovar_databases_assembly)
5176
5177        # Data
5178        table_variants = self.get_table_variants()
5179
5180        # Check if not empty
5181        log.debug("Check if not empty")
5182        sql_query_chromosomes = (
5183            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
5184        )
5185        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
5186        if not sql_query_chromosomes_df["count"][0]:
5187            log.info(f"VCF empty")
5188            return
5189
5190        # VCF header
5191        vcf_reader = self.get_header()
5192        log.debug("Initial header: " + str(vcf_reader.infos))
5193
5194        # Existing annotations
5195        for vcf_annotation in self.get_header().infos:
5196
5197            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
5198            log.debug(
5199                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
5200            )
5201
5202        force_update_annotation = True
5203
5204        if annotations:
5205
5206            commands = []
5207            tmp_annotates_vcf_name_list = []
5208
5209            # Export in VCF
5210            log.debug("Create initial file to annotate")
5211            tmp_vcf = NamedTemporaryFile(
5212                prefix=self.get_prefix(),
5213                dir=self.get_tmp_dir(),
5214                suffix=".vcf.gz",
5215                delete=False,
5216            )
5217            tmp_vcf_name = tmp_vcf.name
5218            tmp_files.append(tmp_vcf_name)
5219            tmp_files.append(tmp_vcf_name + ".tbi")
5220
5221            # Export VCF file
5222            self.export_variant_vcf(
5223                vcf_file=tmp_vcf_name,
5224                remove_info=".",
5225                add_samples=False,
5226                index=True,
5227            )
5228
5229            # Create file for field rename
5230            log.debug("Create file for field rename")
5231            tmp_rename = NamedTemporaryFile(
5232                prefix=self.get_prefix(),
5233                dir=self.get_tmp_dir(),
5234                suffix=".rename",
5235                delete=False,
5236            )
5237            tmp_rename_name = tmp_rename.name
5238            tmp_files.append(tmp_rename_name)
5239
5240            # Check Annovar database
5241            log.debug(
5242                f"Check Annovar databases {[assembly]}: {list(annotations.keys())}"
5243            )
5244            databases_download_annovar(
5245                folder=annovar_databases,
5246                files=list(annotations.keys()),
5247                assemblies=[assembly],
5248            )
5249
5250            for annotation in annotations:
5251                annotation_fields = annotations[annotation]
5252
5253                if not annotation_fields:
5254                    annotation_fields = {"INFO": None}
5255
5256                log.info(f"Annotations Annovar - database '{annotation}'")
5257                log.debug(f"Annotation '{annotation}' - fields: {annotation_fields}")
5258
5259                # Tmp file for annovar
5260                err_files = []
5261                tmp_annotate_vcf_directory = TemporaryDirectory(
5262                    prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".annovar"
5263                )
5264                tmp_annotate_vcf_prefix = tmp_annotate_vcf_directory.name + "/annovar"
5265                tmp_annotate_vcf_name_annovar = (
5266                    tmp_annotate_vcf_prefix + "." + assembly + "_multianno.vcf"
5267                )
5268                tmp_annotate_vcf_name_err = tmp_annotate_vcf_directory.name + "/.err"
5269                err_files.append(tmp_annotate_vcf_name_err)
5270                tmp_files.append(tmp_annotate_vcf_name_err)
5271
5272                # Tmp file final vcf annotated by annovar
5273                tmp_annotate_vcf = NamedTemporaryFile(
5274                    prefix=self.get_prefix(),
5275                    dir=self.get_tmp_dir(),
5276                    suffix=".vcf.gz",
5277                    delete=False,
5278                )
5279                tmp_annotate_vcf_name = tmp_annotate_vcf.name
5280                tmp_annotates_vcf_name_list.append(tmp_annotate_vcf_name)
5281                tmp_files.append(tmp_annotate_vcf_name)
5282                tmp_files.append(tmp_annotate_vcf_name + ".tbi")
5283
5284                # Number of fields
5285                annotation_list = []
5286                annotation_renamed_list = []
5287
5288                for annotation_field in annotation_fields:
5289
5290                    # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
5291                    annotation_fields_new_name = annotation_fields.get(
5292                        annotation_field, annotation_field
5293                    )
5294                    if not annotation_fields_new_name:
5295                        annotation_fields_new_name = annotation_field
5296
5297                    if (
5298                        force_update_annotation
5299                        or annotation_fields_new_name not in self.get_header().infos
5300                    ):
5301                        annotation_list.append(annotation_field)
5302                        annotation_renamed_list.append(annotation_fields_new_name)
5303                    else:  # annotation_fields_new_name in self.get_header().infos and not force_update_annotation:
5304                        log.warning(
5305                            f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
5306                        )
5307
5308                    # Add rename info
5309                    run_parallel_commands(
5310                        [
5311                            f"echo 'INFO/{annotation_field} {annotation_fields_new_name}' >> {tmp_rename_name}"
5312                        ],
5313                        1,
5314                    )
5315
5316                # log.debug("fields_to_removed: " + str(fields_to_removed))
5317                log.debug("annotation_list: " + str(annotation_list))
5318
5319                # protocol
5320                protocol = annotation
5321
5322                # argument
5323                argument = ""
5324
5325                # operation
5326                operation = "f"
5327                if annotation in ["refGene", "refGeneWithVer"] or annotation.startswith(
5328                    "ensGene"
5329                ):
5330                    operation = "g"
5331                    if options.get("genebase", None):
5332                        argument = f"""'{options.get("genebase","")}'"""
5333                elif annotation in ["cytoBand"]:
5334                    operation = "r"
5335
5336                # argument option
5337                argument_option = ""
5338                if argument != "":
5339                    argument_option = " --argument " + argument
5340
5341                # command options
5342                command_options = f""" --nastring . --vcfinput --polish --dot2underline --thread {threads} """  # --intronhgvs 10
5343                for option in options:
5344                    if option not in ["genebase"]:
5345                        command_options += f""" --{option}={options[option]}"""
5346
5347                # Command
5348
5349                # Command - Annovar
5350                command_annovar = f"""{annovar_bin_command} {tmp_vcf_name} {annovar_databases_assembly} --buildver {assembly} --outfile {tmp_annotate_vcf_prefix} --remove --protocol {protocol} --operation {operation} {argument_option} {command_options} 2>>{tmp_annotate_vcf_name_err} && mv {tmp_annotate_vcf_name_annovar} {tmp_annotate_vcf_name}.tmp.vcf """
5351                tmp_files.append(f"{tmp_annotate_vcf_name}.tmp.vcf")
5352
5353                # Command - start pipe
5354                command_annovar += f""" && {bcftools_bin_command} view --threads={threads} {tmp_annotate_vcf_name}.tmp.vcf 2>>{tmp_annotate_vcf_name_err} """
5355
5356                # Command - Clean INFO/ANNOVAR_DATE (due to Annovar issue with multiple TAGS!)
5357                command_annovar += """ | sed "s/ANNOVAR_DATE=[^;\t]*;//gi" """
5358
5359                # Command - Special characters (refGene annotation)
5360                command_annovar += """ | sed "s/\\\\\\x3b/,/gi" """
5361
5362                # Command - Clean empty fields (with value ".")
5363                command_annovar += """ | awk -F'\\t' -v OFS='\\t' '{if ($0 ~ /^#/) print; else {split($8,a,";");for(i=1;i<=length(a);i++) {split(a[i],b,"=");if(b[2]!=".") {c[b[1]]=b[2]}}; split($8,d,";");for(i=1;i<=length(d);i++) {split(d[i],e,"=");if(c[e[1]]!="") {if(info!="") {info=info";"}; info=info""e[1]"="c[e[1]]}}; if(info!="") {$8=info} else {$8=""}; delete c; info=""; print}}' """
5364
5365                # Command - Extract only needed fields, and remove ANNOVAR fields, and compress and index final file
5366                annovar_fields_to_keep = ["INFO/ANNOVAR_DATE", "INFO/ALLELE_END"]
5367                if "ALL" not in annotation_list and "INFO" not in annotation_list:
5368                    # for ann in annotation_renamed_list:
5369                    for ann in annotation_list:
5370                        annovar_fields_to_keep.append(f"^INFO/{ann}")
5371
5372                command_annovar += f""" | {bcftools_bin_command} annotate --pair-logic exact --threads={threads} -x {",".join(annovar_fields_to_keep)} --rename-annots={tmp_rename_name} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} """
5373
5374                # Command - indexing
5375                command_annovar += f"""  && tabix {tmp_annotate_vcf_name} """
5376
5377                log.debug(f"Annotation - Annovar command: {command_annovar}")
5378                run_parallel_commands([command_annovar], 1)
5379
5380                # Error messages
5381                log.info(f"Error/Warning messages:")
5382                error_message_command_all = []
5383                error_message_command_warning = []
5384                error_message_command_err = []
5385                for err_file in err_files:
5386                    with open(err_file, "r") as f:
5387                        for line in f:
5388                            message = line.strip()
5389                            error_message_command_all.append(message)
5390                            if line.startswith("[W::") or line.startswith("WARNING"):
5391                                error_message_command_warning.append(message)
5392                            if line.startswith("[E::") or line.startswith("ERROR"):
5393                                error_message_command_err.append(
5394                                    f"{err_file}: " + message
5395                                )
5396                # log info
5397                for message in list(
5398                    set(error_message_command_err + error_message_command_warning)
5399                ):
5400                    log.info(f"   {message}")
5401                # debug info
5402                for message in list(set(error_message_command_all)):
5403                    log.debug(f"   {message}")
5404                # failed
5405                if len(error_message_command_err):
5406                    log.error("Annotation failed: Error in commands")
5407                    raise ValueError("Annotation failed: Error in commands")
5408
5409            if tmp_annotates_vcf_name_list:
5410
5411                # List of annotated files
5412                tmp_annotates_vcf_name_to_merge = " ".join(tmp_annotates_vcf_name_list)
5413
5414                # Tmp file
5415                tmp_annotate_vcf = NamedTemporaryFile(
5416                    prefix=self.get_prefix(),
5417                    dir=self.get_tmp_dir(),
5418                    suffix=".vcf.gz",
5419                    delete=False,
5420                )
5421                tmp_annotate_vcf_name = tmp_annotate_vcf.name
5422                tmp_files.append(tmp_annotate_vcf_name)
5423                tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
5424                err_files.append(tmp_annotate_vcf_name_err)
5425                tmp_files.append(tmp_annotate_vcf_name_err)
5426
5427                # Command merge
5428                merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_annotates_vcf_name_to_merge} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} "
5429                log.info(
5430                    f"Annotation Annovar - Annotation merging "
5431                    + str(len(tmp_annotates_vcf_name_list))
5432                    + " annotated files"
5433                )
5434                log.debug(f"Annotation - merge command: {merge_command}")
5435                run_parallel_commands([merge_command], 1)
5436
5437                # Find annotation in header
5438                with bgzf.open(tmp_annotate_vcf_name, "rt") as f:
5439                    header_list = self.read_vcf_header(f)
5440                annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))
5441
5442                for ann in annovar_vcf_header.infos:
5443                    if ann not in self.get_header().infos:
5444                        vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)
5445
5446                # Update variants
5447                log.info(f"Annotation Annovar - Updating...")
5448                self.update_from_vcf(tmp_annotate_vcf_name)
5449
5450            # Clean files
5451            # Tmp file remove command
5452            if True:
5453                tmp_files_remove_command = ""
5454                if tmp_files:
5455                    tmp_files_remove_command = " ".join(tmp_files)
5456                clean_command = f" rm -f {tmp_files_remove_command} "
5457                log.debug(f"Annotation Annovar - Annotation cleaning ")
5458                log.debug(f"Annotation - cleaning command: {clean_command}")
5459                run_parallel_commands([clean_command], 1)

    # NOTE(review): the text below is a displaced docstring fragment (extraction
    # artifact) belonging to the Annovar annotation method above; preserved here
    # as a comment so the file stays syntactically valid.
    #
    # It takes a VCF file, annotates it with Annovar, and then updates the
    # database with the new annotations.
    #
    # Parameters:
    #     threads: number of threads to use
    #
    # Returns:
    #     the value of the variable "return_value".
5462    def annotation_parquet(self, threads: int = None) -> None:
5463        """
5464        It takes a VCF file, and annotates it with a parquet file
5465
5466        :param threads: number of threads to use for the annotation
5467        :return: the value of the variable "result".
5468        """
5469
5470        # DEBUG
5471        log.debug("Start annotation with parquet databases")
5472
5473        # Threads
5474        if not threads:
5475            threads = self.get_threads()
5476        log.debug("Threads: " + str(threads))
5477
5478        # DEBUG
5479        delete_tmp = True
5480        if self.get_config().get("verbosity", "warning") in ["debug"]:
5481            delete_tmp = False
5482            log.debug("Delete tmp files/folders: " + str(delete_tmp))
5483
5484        # Config
5485        databases_folders = set(
5486            self.get_config()
5487            .get("folders", {})
5488            .get("databases", {})
5489            .get("annotations", ["."])
5490            + self.get_config()
5491            .get("folders", {})
5492            .get("databases", {})
5493            .get("parquet", ["."])
5494        )
5495        log.debug("Databases annotations: " + str(databases_folders))
5496
5497        # Param
5498        annotations = (
5499            self.get_param()
5500            .get("annotation", {})
5501            .get("parquet", {})
5502            .get("annotations", None)
5503        )
5504        log.debug("Annotations: " + str(annotations))
5505
5506        # Assembly
5507        assembly = self.get_param().get(
5508            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
5509        )
5510
5511        # Force Update Annotation
5512        force_update_annotation = (
5513            self.get_param()
5514            .get("annotation", {})
5515            .get("options", {})
5516            .get("annotations_update", False)
5517        )
5518        log.debug(f"force_update_annotation={force_update_annotation}")
5519        force_append_annotation = (
5520            self.get_param()
5521            .get("annotation", {})
5522            .get("options", {})
5523            .get("annotations_append", False)
5524        )
5525        log.debug(f"force_append_annotation={force_append_annotation}")
5526
5527        # Data
5528        table_variants = self.get_table_variants()
5529
5530        # Check if not empty
5531        log.debug("Check if not empty")
5532        sql_query_chromosomes_df = self.get_query_to_df(
5533            f"""SELECT count(*) as count FROM {table_variants} as table_variants LIMIT 1"""
5534        )
5535        if not sql_query_chromosomes_df["count"][0]:
5536            log.info(f"VCF empty")
5537            return
5538
5539        # VCF header
5540        vcf_reader = self.get_header()
5541        log.debug("Initial header: " + str(vcf_reader.infos))
5542
5543        # Nb Variants POS
5544        log.debug("NB Variants Start")
5545        nb_variants = self.conn.execute(
5546            f"SELECT count(*) AS count FROM variants"
5547        ).fetchdf()["count"][0]
5548        log.debug("NB Variants Stop")
5549
5550        # Existing annotations
5551        for vcf_annotation in self.get_header().infos:
5552
5553            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
5554            log.debug(
5555                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
5556            )
5557
5558        # Added columns
5559        added_columns = []
5560
5561        # drop indexes
5562        log.debug(f"Drop indexes...")
5563        self.drop_indexes()
5564
5565        if annotations:
5566
5567            if "ALL" in annotations:
5568
5569                all_param = annotations.get("ALL", {})
5570                all_param_formats = all_param.get("formats", None)
5571                all_param_releases = all_param.get("releases", None)
5572
5573                databases_infos_dict = self.scan_databases(
5574                    database_formats=all_param_formats,
5575                    database_releases=all_param_releases,
5576                )
5577                for database_infos in databases_infos_dict.keys():
5578                    if database_infos not in annotations:
5579                        annotations[database_infos] = {"INFO": None}
5580
5581            for annotation in annotations:
5582
5583                if annotation in ["ALL"]:
5584                    continue
5585
5586                # Annotation Name
5587                annotation_name = os.path.basename(annotation)
5588
5589                # Annotation fields
5590                annotation_fields = annotations[annotation]
5591                if not annotation_fields:
5592                    annotation_fields = {"INFO": None}
5593
5594                log.debug(f"Annotation '{annotation_name}'")
5595                log.debug(
5596                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
5597                )
5598
5599                # Create Database
5600                database = Database(
5601                    database=annotation,
5602                    databases_folders=databases_folders,
5603                    assembly=assembly,
5604                )
5605
5606                # Find files
5607                parquet_file = database.get_database()
5608                parquet_hdr_file = database.get_header_file()
5609                parquet_type = database.get_type()
5610
5611                # Check if files exists
5612                if not parquet_file or not parquet_hdr_file:
5613                    log.error("Annotation failed: file not found")
5614                    raise ValueError("Annotation failed: file not found")
5615                else:
5616                    # Get parquet connexion
5617                    parquet_sql_attach = database.get_sql_database_attach(
5618                        output="query"
5619                    )
5620                    if parquet_sql_attach:
5621                        self.conn.execute(parquet_sql_attach)
5622                    parquet_file_link = database.get_sql_database_link()
5623                    # Log
5624                    log.debug(
5625                        f"Annotation '{annotation_name}' - file: "
5626                        + str(parquet_file)
5627                        + " and "
5628                        + str(parquet_hdr_file)
5629                    )
5630
5631                    # Database full header columns
5632                    parquet_hdr_vcf_header_columns = database.get_header_file_columns(
5633                        parquet_hdr_file
5634                    )
5635                    # Log
5636                    log.debug(
5637                        "Annotation database header columns : "
5638                        + str(parquet_hdr_vcf_header_columns)
5639                    )
5640
5641                    # Load header as VCF object
5642                    parquet_hdr_vcf_header_infos = database.get_header().infos
5643                    # Log
5644                    log.debug(
5645                        "Annotation database header: "
5646                        + str(parquet_hdr_vcf_header_infos)
5647                    )
5648
5649                    # Get extra infos
5650                    parquet_columns = database.get_extra_columns()
5651                    # Log
5652                    log.debug("Annotation database Columns: " + str(parquet_columns))
5653
5654                    # Add extra columns if "ALL" in annotation_fields
5655                    # if "ALL" in annotation_fields:
5656                    #     allow_add_extra_column = True
5657                    if "ALL" in annotation_fields and database.get_extra_columns():
5658                        for extra_column in database.get_extra_columns():
5659                            if (
5660                                extra_column not in annotation_fields
5661                                and extra_column.replace("INFO/", "")
5662                                not in parquet_hdr_vcf_header_infos
5663                            ):
5664                                parquet_hdr_vcf_header_infos[extra_column] = (
5665                                    vcf.parser._Info(
5666                                        extra_column,
5667                                        ".",
5668                                        "String",
5669                                        f"{extra_column} description",
5670                                        "unknown",
5671                                        "unknown",
5672                                        self.code_type_map["String"],
5673                                    )
5674                                )
5675
5676                    # For all fields in database
5677                    annotation_fields_all = False
5678                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
5679                        annotation_fields_all = True
5680                        annotation_fields = {
5681                            key: key for key in parquet_hdr_vcf_header_infos
5682                        }
5683
5684                        log.debug(
5685                            "Annotation database header - All annotations added: "
5686                            + str(annotation_fields)
5687                        )
5688
5689                    # Init
5690
5691                    # List of annotation fields to use
5692                    sql_query_annotation_update_info_sets = []
5693
5694                    # List of annotation to agregate
5695                    sql_query_annotation_to_agregate = []
5696
5697                    # Number of fields
5698                    nb_annotation_field = 0
5699
5700                    # Annotation fields processed
5701                    annotation_fields_processed = []
5702
5703                    # Columns mapping
5704                    map_columns = database.map_columns(
5705                        columns=annotation_fields, prefixes=["INFO/"]
5706                    )
5707
5708                    # Query dict for fields to remove (update option)
5709                    query_dict_remove = {}
5710
5711                    # Fetch Anotation fields
5712                    for annotation_field in annotation_fields:
5713
5714                        # annotation_field_column
5715                        annotation_field_column = map_columns.get(
5716                            annotation_field, "INFO"
5717                        )
5718
5719                        # field new name, if parametered
5720                        annotation_fields_new_name = annotation_fields.get(
5721                            annotation_field, annotation_field
5722                        )
5723                        if not annotation_fields_new_name:
5724                            annotation_fields_new_name = annotation_field
5725
5726                        # To annotate
5727                        # force_update_annotation = True
5728                        # force_append_annotation = True
5729                        # if annotation_field in parquet_hdr_vcf_header_infos and (force_update_annotation or (annotation_fields_new_name not in self.get_header().infos)):
5730                        if annotation_field in parquet_hdr_vcf_header_infos and (
5731                            force_update_annotation
5732                            or force_append_annotation
5733                            or (
5734                                annotation_fields_new_name
5735                                not in self.get_header().infos
5736                            )
5737                        ):
5738
5739                            # Add field to annotation to process list
5740                            annotation_fields_processed.append(
5741                                annotation_fields_new_name
5742                            )
5743
5744                            # explode infos for the field
5745                            annotation_fields_new_name_info_msg = ""
5746                            if (
5747                                force_update_annotation
5748                                and annotation_fields_new_name
5749                                in self.get_header().infos
5750                            ):
5751                                # Remove field from INFO
5752                                query = f"""
5753                                    UPDATE {table_variants} as table_variants
5754                                    SET INFO = REGEXP_REPLACE(
5755                                                concat(table_variants.INFO,''),
5756                                                ';*{annotation_fields_new_name}=[^;]*',
5757                                                ''
5758                                                )
5759                                    WHERE concat(';',table_variants.INFO) LIKE '%;{annotation_fields_new_name}=%'
5760                                """
5761                                annotation_fields_new_name_info_msg = " [update]"
5762                                query_dict_remove[
5763                                    f"remove 'INFO/{annotation_fields_new_name}'"
5764                                ] = query
5765
5766                            # Sep between fields in INFO
5767                            nb_annotation_field += 1
5768                            if nb_annotation_field > 1:
5769                                annotation_field_sep = ";"
5770                            else:
5771                                annotation_field_sep = ""
5772
5773                            log.info(
5774                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'{annotation_fields_new_name_info_msg}"
5775                            )
5776
5777                            # Add INFO field to header
5778                            parquet_hdr_vcf_header_infos_number = (
5779                                parquet_hdr_vcf_header_infos[annotation_field].num
5780                                or "."
5781                            )
5782                            parquet_hdr_vcf_header_infos_type = (
5783                                parquet_hdr_vcf_header_infos[annotation_field].type
5784                                or "String"
5785                            )
5786                            parquet_hdr_vcf_header_infos_description = (
5787                                parquet_hdr_vcf_header_infos[annotation_field].desc
5788                                or f"{annotation_field} description"
5789                            )
5790                            parquet_hdr_vcf_header_infos_source = (
5791                                parquet_hdr_vcf_header_infos[annotation_field].source
5792                                or "unknown"
5793                            )
5794                            parquet_hdr_vcf_header_infos_version = (
5795                                parquet_hdr_vcf_header_infos[annotation_field].version
5796                                or "unknown"
5797                            )
5798
5799                            vcf_reader.infos[annotation_fields_new_name] = (
5800                                vcf.parser._Info(
5801                                    annotation_fields_new_name,
5802                                    parquet_hdr_vcf_header_infos_number,
5803                                    parquet_hdr_vcf_header_infos_type,
5804                                    parquet_hdr_vcf_header_infos_description,
5805                                    parquet_hdr_vcf_header_infos_source,
5806                                    parquet_hdr_vcf_header_infos_version,
5807                                    self.code_type_map[
5808                                        parquet_hdr_vcf_header_infos_type
5809                                    ],
5810                                )
5811                            )
5812
5813                            # Append
5814                            if force_append_annotation:
5815                                query_case_when_append = f""" AND REGEXP_EXTRACT(concat(';', table_variants.INFO), ';{annotation_fields_new_name}=([^;]*)',1) IN ('','.') """
5816                            else:
5817                                query_case_when_append = ""
5818
5819                            # Annotation/Update query fields
5820                            # Found in INFO column
5821                            if (
5822                                annotation_field_column == "INFO"
5823                                and "INFO" in parquet_hdr_vcf_header_columns
5824                            ):
5825                                sql_query_annotation_update_info_sets.append(
5826                                    f"""
5827                                CASE WHEN REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1) NOT IN ('','.') {query_case_when_append}
5828                                        THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1))
5829                                        ELSE ''
5830                                    END
5831                                """
5832                                )
5833                            # Found in a specific column
5834                            else:
5835                                # sql_query_annotation_update_info_sets.append(
5836                                #     f"""
5837                                # CASE WHEN table_parquet."{annotation_field_column}" NOT IN ('','.') {query_case_when_append}
5838                                #         THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', replace(table_parquet."{annotation_field_column}", ';', ','))
5839                                #         ELSE ''
5840                                #     END
5841                                # """
5842                                # )
5843                                sql_query_annotation_update_info_sets.append(
5844                                    f"""
5845                                CASE WHEN table_parquet."{annotation_field_column}" NOT IN ('','.') {query_case_when_append}
5846                                        THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', replace(CAST(table_parquet."{annotation_field_column}" AS VARCHAR), ';', ','))
5847                                        ELSE ''
5848                                    END
5849                                """
5850                                )
5851                                sql_query_annotation_to_agregate.append(
5852                                    f""" string_agg(DISTINCT table_parquet_from."{annotation_field_column}", ',') AS "{annotation_field_column}" """
5853                                )
5854
5855                        # Not to annotate
5856                        else:
5857
5858                            if force_update_annotation:
5859                                annotation_message = "forced"
5860                            else:
5861                                annotation_message = "skipped"
5862
5863                            if annotation_field not in parquet_hdr_vcf_header_infos:
5864                                log.warning(
5865                                    f"Annotation '{annotation_name}' - '{annotation_field}' [{nb_annotation_field}] - not available in parquet file"
5866                                )
5867                            if annotation_fields_new_name in self.get_header().infos:
5868                                log.warning(
5869                                    f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' [{nb_annotation_field}] - already exists in header ({annotation_message})"
5870                                )
5871
5872                    # Check if ALL fields have to be annotated. Thus concat all INFO field
5873                    # allow_annotation_full_info = True
5874                    allow_annotation_full_info = not force_append_annotation
5875
5876                    if parquet_type in ["regions"]:
5877                        allow_annotation_full_info = False
5878
5879                    if (
5880                        allow_annotation_full_info
5881                        and nb_annotation_field == len(annotation_fields)
5882                        and annotation_fields_all
5883                        and (
5884                            "INFO" in parquet_hdr_vcf_header_columns
5885                            and "INFO" in database.get_extra_columns()
5886                        )
5887                    ):
5888                        log.debug("Column INFO annotation enabled")
5889                        sql_query_annotation_update_info_sets = []
5890                        sql_query_annotation_update_info_sets.append(
5891                            f" table_parquet.INFO "
5892                        )
5893
5894                    if sql_query_annotation_update_info_sets:
5895
5896                        # Annotate
5897                        log.info(f"Annotation '{annotation_name}' - Annotation...")
5898
5899                        # Join query annotation update info sets for SQL
5900                        sql_query_annotation_update_info_sets_sql = ",".join(
5901                            sql_query_annotation_update_info_sets
5902                        )
5903
5904                        # Check chromosomes list (and variants infos)
5905                        sql_query_chromosomes = f"""
5906                            SELECT table_variants."#CHROM" as CHROM, count(*) AS count_variants, min(POS) AS min_variants, MAX(POS) AS max_variants
5907                            FROM {table_variants} as table_variants
5908                            GROUP BY table_variants."#CHROM"
5909                            ORDER BY table_variants."#CHROM"
5910                            """
5911                        sql_query_chromosomes_df = self.conn.execute(
5912                            sql_query_chromosomes
5913                        ).df()
5914                        sql_query_chromosomes_dict = {
5915                            entry["CHROM"]: {
5916                                "count": entry["count_variants"],
5917                                "min": entry["min_variants"],
5918                                "max": entry["max_variants"],
5919                            }
5920                            for index, entry in sql_query_chromosomes_df.iterrows()
5921                        }
5922
5923                        # Init
5924                        nb_of_query = 0
5925                        nb_of_variant_annotated = 0
5926                        query_dict = query_dict_remove
5927
5928                        # for chrom in sql_query_chromosomes_df["CHROM"]:
5929                        for chrom in sql_query_chromosomes_dict:
5930
5931                            # Number of variant by chromosome
5932                            nb_of_variant_by_chrom = sql_query_chromosomes_dict.get(
5933                                chrom, {}
5934                            ).get("count", 0)
5935
5936                            log.debug(
5937                                f"Annotation '{annotation_name}' - Chromosome '{chrom}' [{nb_of_variant_by_chrom} variants]..."
5938                            )
5939
5940                            # Annotation with regions database
5941                            if parquet_type in ["regions"]:
5942                                sql_query_annotation_from_clause = f"""
5943                                    FROM (
5944                                        SELECT 
5945                                            '{chrom}' AS \"#CHROM\",
5946                                            table_variants_from.\"POS\" AS \"POS\",
5947                                            {",".join(sql_query_annotation_to_agregate)}
5948                                        FROM {table_variants} as table_variants_from
5949                                        LEFT JOIN {parquet_file_link} as table_parquet_from ON (
5950                                            table_parquet_from."#CHROM" = '{chrom}'
5951                                            AND table_variants_from.\"POS\" <= table_parquet_from.\"END\"
5952                                            AND (table_variants_from.\"POS\" >= (table_parquet_from.\"START\"+1)
5953                                                OR table_variants_from.\"POS\" + (len(table_variants_from.\"REF\")-1) >= (table_parquet_from.\"START\"+1)
5954                                                )
5955                                        )
5956                                        WHERE table_variants_from.\"#CHROM\" in ('{chrom}')
5957                                        GROUP BY table_variants_from.\"POS\"
5958                                        )
5959                                        as table_parquet
5960                                """
5961
5962                                sql_query_annotation_where_clause = """
5963                                    table_parquet.\"#CHROM\" = table_variants.\"#CHROM\"
5964                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
5965                                """
5966
5967                            # Annotation with variants database
5968                            else:
5969                                sql_query_annotation_from_clause = f"""
5970                                    FROM {parquet_file_link} as table_parquet
5971                                """
5972                                sql_query_annotation_where_clause = f"""
5973                                    table_variants."#CHROM" = '{chrom}'
5974                                    AND table_parquet.\"#CHROM\" = table_variants.\"#CHROM\" 
5975                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
5976                                    AND table_parquet.\"ALT\" = table_variants.\"ALT\"
5977                                    AND table_parquet.\"REF\" = table_variants.\"REF\"
5978                                """
5979
5980                            # Create update query
5981                            sql_query_annotation_chrom_interval_pos = f"""
5982                                UPDATE {table_variants} as table_variants
5983                                    SET INFO = 
5984                                        concat(
5985                                            CASE WHEN table_variants.INFO NOT IN ('','.')
5986                                                THEN table_variants.INFO
5987                                                ELSE ''
5988                                            END
5989                                            ,
5990                                            CASE WHEN table_variants.INFO NOT IN ('','.')
5991                                                        AND (
5992                                                        concat({sql_query_annotation_update_info_sets_sql})
5993                                                        )
5994                                                        NOT IN ('','.') 
5995                                                    THEN ';'
5996                                                    ELSE ''
5997                                            END
5998                                            ,
5999                                            {sql_query_annotation_update_info_sets_sql}
6000                                            )
6001                                    {sql_query_annotation_from_clause}
6002                                    WHERE {sql_query_annotation_where_clause}
6003                                    ;
6004                                """
6005
6006                            # Add update query to dict
6007                            query_dict[
6008                                f"{chrom} [{nb_of_variant_by_chrom} variants]"
6009                            ] = sql_query_annotation_chrom_interval_pos
6010
6011                        nb_of_query = len(query_dict)
6012                        num_query = 0
6013
6014                        # SET max_expression_depth TO x
6015                        self.conn.execute("SET max_expression_depth TO 10000")
6016
6017                        for query_name in query_dict:
6018                            query = query_dict[query_name]
6019                            num_query += 1
6020                            log.info(
6021                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name}..."
6022                            )
6023                            result = self.conn.execute(query)
6024                            nb_of_variant_annotated_by_query = result.df()["Count"][0]
6025                            nb_of_variant_annotated += nb_of_variant_annotated_by_query
6026                            log.info(
6027                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name} - {nb_of_variant_annotated_by_query} variants annotated"
6028                            )
6029
6030                        log.info(
6031                            f"Annotation '{annotation_name}' - Annotation of {nb_of_variant_annotated} variants out of {nb_variants} (with {nb_of_query} queries)"
6032                        )
6033
6034                    else:
6035
6036                        log.info(
6037                            f"Annotation '{annotation_name}' - No Annotations available"
6038                        )
6039
6040                    log.debug("Final header: " + str(vcf_reader.infos))
6041
6042        # Remove added columns
6043        for added_column in added_columns:
6044            self.drop_column(column=added_column)

It takes a VCF file and annotates it with a Parquet annotation database.

Parameters
  • threads: number of threads to use for the annotation
Returns

None; the variants table is annotated in place.
def annotation_splice(self, threads: int = None) -> None:
6046    def annotation_splice(self, threads: int = None) -> None:
6047        """
6048        This function annotate with snpEff
6049
6050        :param threads: The number of threads to use
6051        :return: the value of the variable "return_value".
6052        """
6053
6054        # DEBUG
6055        log.debug("Start annotation with splice tools")
6056
6057        # Threads
6058        if not threads:
6059            threads = self.get_threads()
6060        log.debug("Threads: " + str(threads))
6061
6062        # DEBUG
6063        delete_tmp = True
6064        if self.get_config().get("verbosity", "warning") in ["debug"]:
6065            delete_tmp = False
6066            log.debug("Delete tmp files/folders: " + str(delete_tmp))
6067
6068        # Config
6069        config = self.get_config()
6070        log.debug("Config: " + str(config))
6071        splice_config = config.get("tools", {}).get("splice", {})
6072        if not splice_config:
6073            splice_config = DEFAULT_TOOLS_BIN.get("splice", {})
6074        if not splice_config:
6075            msg_err = "No Splice tool config"
6076            log.error(msg_err)
6077            raise ValueError(msg_err)
6078        log.debug(f"splice_config={splice_config}")
6079
6080        # Config - Folders - Databases
6081        databases_folders = (
6082            config.get("folders", {}).get("databases", {}).get("splice", ["."])
6083        )
6084        log.debug("Databases annotations: " + str(databases_folders))
6085
6086        # Splice docker image
6087        splice_docker_image = splice_config.get("docker").get("image")
6088
6089        # Pull splice image if it's not already there
6090        if not check_docker_image_exists(splice_docker_image):
6091            log.warning(
6092                f"Annotation: splice docker image {splice_docker_image} not found locally, trying to pull from dockerhub"
6093            )
6094            try:
6095                command(f"docker pull {splice_config.get('docker').get('image')}")
6096            except subprocess.CalledProcessError:
6097                msg_err = f"Unable to find docker {splice_docker_image} on dockerhub"
6098                log.error(msg_err)
6099                raise ValueError(msg_err)
6100                return None
6101
6102        # Config - splice databases
6103        splice_databases = (
6104            config.get("folders", {})
6105            .get("databases", {})
6106            .get("splice", DEFAULT_SPLICE_FOLDER)
6107        )
6108        splice_databases = full_path(splice_databases)
6109
6110        # Param
6111        param = self.get_param()
6112        log.debug("Param: " + str(param))
6113
6114        # Param
6115        options = param.get("annotation", {}).get("splice", {})
6116        log.debug("Options: " + str(options))
6117
6118        # Data
6119        table_variants = self.get_table_variants()
6120
6121        # Check if not empty
6122        log.debug("Check if not empty")
6123        sql_query_chromosomes = (
6124            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
6125        )
6126        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
6127            log.info("VCF empty")
6128            return None
6129
6130        # Export in VCF
6131        log.debug("Create initial file to annotate")
6132
6133        # Create output folder
6134        output_folder = os.path.join(self.get_tmp_dir(), f"splice-{get_random()}")
6135        if not os.path.exists(output_folder):
6136            Path(output_folder).mkdir(parents=True, exist_ok=True)
6137
6138        # Create tmp VCF file
6139        tmp_vcf = NamedTemporaryFile(
6140            prefix=self.get_prefix(),
6141            dir=output_folder,
6142            suffix=".vcf",
6143            delete=False,
6144        )
6145        tmp_vcf_name = tmp_vcf.name
6146
6147        # VCF header
6148        header = self.get_header()
6149
6150        # Existing annotations
6151        for vcf_annotation in self.get_header().infos:
6152
6153            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
6154            log.debug(
6155                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
6156            )
6157
6158        # Memory limit
6159        if config.get("memory", None):
6160            memory_limit = config.get("memory", "8G").upper()
6161            # upper()
6162        else:
6163            memory_limit = "8G"
6164        log.debug(f"memory_limit: {memory_limit}")
6165
6166        # Check number of variants to annotate
6167        where_clause_regex_spliceai = r"SpliceAI_\w+"
6168        where_clause_regex_spip = r"SPiP_\w+"
6169        where_clause = f""" WHERE NOT regexp_matches("INFO", '{where_clause_regex_spliceai}') AND NOT regexp_matches("INFO", '{where_clause_regex_spip}')"""
6170        df_list_of_variants_to_annotate = self.get_query_to_df(
6171            query=f""" SELECT * FROM variants {where_clause} """
6172        )
6173        if len(df_list_of_variants_to_annotate) == 0:
6174            log.warning(
6175                f"No variants to annotate with splice. Variants probably already annotated with splice"
6176            )
6177            return None
6178        else:
6179            log.info(f"Annotation: {len(df_list_of_variants_to_annotate)} variants")
6180
6181        # Export VCF file
6182        self.export_variant_vcf(
6183            vcf_file=tmp_vcf_name,
6184            remove_info=True,
6185            add_samples=True,
6186            index=False,
6187            where_clause=where_clause,
6188        )
6189
6190        # Create docker container and launch splice analysis
6191        if splice_config:
6192
6193            # Splice mount folders
6194            mount_folders = splice_config.get("mount", {})
6195
6196            # Genome mount
6197            mount_folders[
6198                config.get("folders", {})
6199                .get("databases", {})
6200                .get("genomes", DEFAULT_GENOME_FOLDER)
6201            ] = "ro"
6202
6203            # SpliceAI mount
6204            mount_folders[
6205                config.get("folders", {})
6206                .get("databases", {})
6207                .get("spliceai", DEFAULT_SPLICEAI_FOLDER)
6208            ] = "ro"
6209
6210            # Genome mount
6211            mount_folders[
6212                config.get("folders", {})
6213                .get("databases", {})
6214                .get("spip", DEFAULT_SPIP_FOLDER)
6215            ] = "ro"
6216
6217            # Mount folders
6218            mount = []
6219
6220            # Config mount
6221            mount = [
6222                f"-v {full_path(path)}:{full_path(path)}:{mode}"
6223                for path, mode in mount_folders.items()
6224            ]
6225
6226            if any(value for value in splice_config.values() if value is None):
6227                log.warning("At least one splice config parameter is empty")
6228                return None
6229
6230            # Params in splice nf
6231            def check_values(dico: dict):
6232                """
6233                Ensure parameters for NF splice pipeline
6234                """
6235                for key, val in dico.items():
6236                    if key == "genome":
6237                        if any(
6238                            assemb in options.get("genome", {})
6239                            for assemb in ["hg19", "GRCh37", "grch37", "GRCH37"]
6240                        ):
6241                            yield f"--{key} hg19"
6242                        elif any(
6243                            assemb in options.get("genome", {})
6244                            for assemb in ["hg38", "GRCh38", "grch38", "GRCH38"]
6245                        ):
6246                            yield f"--{key} hg38"
6247                    elif (
6248                        (isinstance(val, str) and val)
6249                        or isinstance(val, int)
6250                        or isinstance(val, bool)
6251                    ):
6252                        yield f"--{key} {val}"
6253
6254            # Genome
6255            genome = options.get("genome", config.get("assembly", DEFAULT_ASSEMBLY))
6256            options["genome"] = genome
6257
6258            # NF params
6259            nf_params = []
6260
6261            # Add options
6262            if options:
6263                nf_params = list(check_values(options))
6264                log.debug(f"Splice NF params: {' '.join(nf_params)}")
6265            else:
6266                log.debug("No NF params provided")
6267
6268            # Add threads
6269            if "threads" not in options.keys():
6270                nf_params.append(f"--threads {threads}")
6271
6272            # Genome path
6273            genome_path = find_genome(
6274                config.get("folders", {})
6275                .get("databases", {})
6276                .get("genomes", DEFAULT_GENOME_FOLDER),
6277                file=f"{genome}.fa",
6278            )
6279            # Add genome path
6280            if not genome_path:
6281                raise ValueError(
6282                    f"Can't find genome assembly {genome}.fa in {config.get('folders', {}).get('databases', {}).get('genomes', DEFAULT_GENOME_FOLDER)}"
6283                )
6284            else:
6285                log.debug(f"Genome: {genome_path}")
6286                nf_params.append(f"--genome_path {genome_path}")
6287
6288            def splice_annotations(options: dict = {}, config: dict = {}) -> list:
6289                """
6290                Setting up updated databases for SPiP and SpliceAI
6291                """
6292
6293                try:
6294
6295                    # SpliceAI assembly transcriptome
6296                    spliceai_assembly = os.path.join(
6297                        config.get("folders", {})
6298                        .get("databases", {})
6299                        .get("spliceai", {}),
6300                        options.get("genome"),
6301                        "transcriptome",
6302                    )
6303                    spip_assembly = options.get("genome")
6304
6305                    spip = find(
6306                        f"transcriptome_{spip_assembly}.RData",
6307                        config.get("folders", {}).get("databases", {}).get("spip", {}),
6308                    )
6309                    spliceai = find("spliceai.refseq.txt", spliceai_assembly)
6310                    log.debug(f"SPiP annotations: {spip}")
6311                    log.debug(f"SpliceAI annotations: {spliceai}")
6312                    if spip and spliceai:
6313                        return [
6314                            f"--spip_transcriptome {spip}",
6315                            f"--spliceai_annotations {spliceai}",
6316                        ]
6317                    else:
6318                        # TODO crash and go on with basic annotations ?
6319                        # raise ValueError(
6320                        #     "Can't find splice databases in configuration EXIT"
6321                        # )
6322                        log.warning(
6323                            "Can't find splice databases in configuration, use annotations file from image"
6324                        )
6325                except TypeError:
6326                    log.warning(
6327                        "Can't find splice databases in configuration, use annotations file from image"
6328                    )
6329                    return []
6330
6331            # Add options, check if transcriptome option have already beend provided
6332            if (
6333                "spip_transcriptome" not in nf_params
6334                and "spliceai_transcriptome" not in nf_params
6335            ):
6336                splice_reference = splice_annotations(options, config)
6337                if splice_reference:
6338                    nf_params.extend(splice_reference)
6339
6340            nf_params.append(f"--output_folder {output_folder}")
6341
6342            random_uuid = f"HOWARD-SPLICE-{get_random()}"
6343            cmd = f"nextflow -log {os.path.join(output_folder, f'{random_uuid}.log')} -c /app/SpliceToolBox/src/splicetoolbox/nextflow/nextflow.docker.config run /app/SpliceToolBox/src/splicetoolbox/nextflow/main.nf -entry SPLICE --vcf {tmp_vcf_name} {' '.join(nf_params)} -profile standard,conda,singularity,report,timeline"
6344            log.debug(cmd)
6345
6346            splice_config["docker"]["command"] = cmd
6347
6348            docker_cmd = get_bin_command(
6349                tool="splice",
6350                bin_type="docker",
6351                config=config,
6352                default_folder=f"{DEFAULT_TOOLS_FOLDER}/docker",
6353                add_options=f"--name {random_uuid} {' '.join(mount)}",
6354            )
6355
6356            # Docker debug
6357            # if splice_config.get("rm_container"):
6358            #     rm_container = "--rm"
6359            # else:
6360            #     rm_container = ""
6361            # docker_cmd = f"docker run {rm_container} --entrypoint '/bin/bash' --name {random_uuid} {' '.join(mount)} {':'.join(splice_config.get('image'))} {cmd}"
6362
6363            log.debug(docker_cmd)
6364            res = subprocess.run(docker_cmd, shell=True, capture_output=True, text=True)
6365            log.debug(res.stdout)
6366            if res.stderr:
6367                log.error(res.stderr)
6368            res.check_returncode()
6369        else:
6370            log.warning(f"Splice tool configuration not found: {config}")
6371
6372        # Update variants
6373        log.info("Annotation - Updating...")
6374        # Test find output vcf
6375        log.debug(
6376            f"TMP splice output: {os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz"
6377        )
6378        output_vcf = []
6379        # Wrong folder to look in
6380        for files in os.listdir(os.path.dirname(tmp_vcf_name)):
6381            if (
6382                files
6383                == f"{os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz"
6384            ):
6385                output_vcf.append(os.path.join(os.path.dirname(tmp_vcf_name), files))
6386        # log.debug(os.listdir(options.get("output_folder")))
6387        log.debug(f"Splice annotated vcf: {output_vcf[0]}")
6388        if not output_vcf:
6389            log.debug(
6390                f"Splice output was not generated {os.path.basename(tmp_vcf_name)}*.spip.spliceai.sorted.vcf.gz"
6391            )
6392        else:
6393            # Get new header from annotated vcf
6394            log.debug(f"Initial header: {len(header.infos)} fields")
6395            # Create new header with splice infos
6396            new_vcf = Variants(input=output_vcf[0])
6397            new_vcf_header = new_vcf.get_header().infos
6398            for keys, infos in new_vcf_header.items():
6399                if keys not in header.infos.keys():
6400                    header.infos[keys] = infos
6401            log.debug(f"New header: {len(header.infos)} fields")
6402            log.debug(f"Splice tmp output: {output_vcf[0]}")
6403            self.update_from_vcf(output_vcf[0])
6404
6405        # Remove folder
6406        remove_if_exists(output_folder)

This function annotates variants with splice prediction tools (SPiP and SpliceAI).

Parameters
  • threads: The number of threads to use
Returns

None; the variants table is annotated in place.

def get_config_default(self, name: str) -> dict:
6412    def get_config_default(self, name: str) -> dict:
6413        """
6414        The function `get_config_default` returns a dictionary containing default configurations for
6415        various calculations and prioritizations.
6416
6417        :param name: The `get_config_default` function returns a dictionary containing default
6418        configurations for different calculations and prioritizations. The `name` parameter is used to
6419        specify which specific configuration to retrieve from the dictionary
6420        :type name: str
6421        :return: The function `get_config_default` returns a dictionary containing default configuration
6422        settings for different calculations and prioritizations. The specific configuration settings are
6423        retrieved based on the input `name` parameter provided to the function. If the `name` parameter
6424        matches a key in the `config_default` dictionary, the corresponding configuration settings are
6425        returned. If there is no match, an empty dictionary is returned.
6426        """
6427
6428        config_default = {
6429            "calculations": {
6430                "variant_chr_pos_alt_ref": {
6431                    "type": "sql",
6432                    "name": "variant_chr_pos_alt_ref",
6433                    "description": "Create a variant ID with chromosome, position, alt and ref",
6434                    "available": False,
6435                    "output_column_name": "variant_chr_pos_alt_ref",
6436                    "output_column_type": "String",
6437                    "output_column_description": "variant ID with chromosome, position, alt and ref",
6438                    "operation_query": """ concat("#CHROM", '_', "POS", '_', "REF", '_', "ALT") """,
6439                    "operation_info": True,
6440                },
6441                "VARTYPE": {
6442                    "type": "sql",
6443                    "name": "VARTYPE",
6444                    "description": "Variant type (e.g. SNV, INDEL, MNV, BND...)",
6445                    "available": True,
6446                    "output_column_name": "VARTYPE",
6447                    "output_column_type": "String",
6448                    "output_column_description": "Variant type: SNV if X>Y, MOSAIC if X>Y,Z or X,Y>Z, INDEL if XY>Z or X>YZ",
6449                    "operation_query": """
6450                            CASE
6451                                WHEN "SVTYPE" NOT NULL THEN "SVTYPE"
6452                                WHEN LENGTH(REF) = 1 AND LENGTH(ALT) = 1 THEN 'SNV'
6453                                WHEN REF LIKE '%,%' OR ALT LIKE '%,%' THEN 'MOSAIC'
6454                                WHEN LENGTH(REF) == LENGTH(ALT) AND LENGTH(REF) > 1 THEN 'MNV'
6455                                WHEN LENGTH(REF) <> LENGTH(ALT) THEN 'INDEL'
6456                                ELSE 'UNDEFINED'
6457                            END
6458                            """,
6459                    "info_fields": ["SVTYPE"],
6460                    "operation_info": True,
6461                },
6462                "snpeff_hgvs": {
6463                    "type": "python",
6464                    "name": "snpeff_hgvs",
6465                    "description": "HGVS nomenclatures from snpEff annotation",
6466                    "available": True,
6467                    "function_name": "calculation_extract_snpeff_hgvs",
6468                    "function_params": ["snpeff_hgvs", "ANN"],
6469                },
6470                "snpeff_ann_explode": {
6471                    "type": "python",
6472                    "name": "snpeff_ann_explode",
6473                    "description": "Explode snpEff annotations with uniquify values",
6474                    "available": True,
6475                    "function_name": "calculation_snpeff_ann_explode",
6476                    "function_params": [False, "fields", "snpeff_", "ANN"],
6477                },
6478                "snpeff_ann_explode_uniquify": {
6479                    "type": "python",
6480                    "name": "snpeff_ann_explode_uniquify",
6481                    "description": "Explode snpEff annotations",
6482                    "available": True,
6483                    "function_name": "calculation_snpeff_ann_explode",
6484                    "function_params": [True, "fields", "snpeff_uniquify_", "ANN"],
6485                },
6486                "snpeff_ann_explode_json": {
6487                    "type": "python",
6488                    "name": "snpeff_ann_explode_json",
6489                    "description": "Explode snpEff annotations in JSON format",
6490                    "available": True,
6491                    "function_name": "calculation_snpeff_ann_explode",
6492                    "function_params": [False, "JSON", "snpeff_json", "ANN"],
6493                },
6494                "NOMEN": {
6495                    "type": "python",
6496                    "name": "NOMEN",
6497                    "description": "NOMEN information (e.g. NOMEN, CNOMEN, PNOMEN...) from HGVS nomenclature field",
6498                    "available": True,
6499                    "function_name": "calculation_extract_nomen",
6500                    "function_params": [],
6501                },
6502                "FINDBYPIPELINE": {
6503                    "type": "python",
6504                    "name": "FINDBYPIPELINE",
6505                    "description": "Number of pipeline that identify the variant (for multi pipeline VCF)",
6506                    "available": True,
6507                    "function_name": "calculation_find_by_pipeline",
6508                    "function_params": ["findbypipeline"],
6509                },
6510                "FINDBYSAMPLE": {
6511                    "type": "python",
6512                    "name": "FINDBYSAMPLE",
6513                    "description": "Number of sample that have a genotype for the variant (for multi sample VCF)",
6514                    "available": True,
6515                    "function_name": "calculation_find_by_pipeline",
6516                    "function_params": ["findbysample"],
6517                },
6518                "GENOTYPECONCORDANCE": {
6519                    "type": "python",
6520                    "name": "GENOTYPECONCORDANCE",
6521                    "description": "Concordance of genotype for multi caller VCF",
6522                    "available": True,
6523                    "function_name": "calculation_genotype_concordance",
6524                    "function_params": [],
6525                },
6526                "BARCODE": {
6527                    "type": "python",
6528                    "name": "BARCODE",
6529                    "description": "BARCODE as VaRank tool",
6530                    "available": True,
6531                    "function_name": "calculation_barcode",
6532                    "function_params": [],
6533                },
6534                "BARCODEFAMILY": {
6535                    "type": "python",
6536                    "name": "BARCODEFAMILY",
6537                    "description": "BARCODEFAMILY as VaRank tool",
6538                    "available": True,
6539                    "function_name": "calculation_barcode_family",
6540                    "function_params": ["BCF"],
6541                },
6542                "TRIO": {
6543                    "type": "python",
6544                    "name": "TRIO",
6545                    "description": "Inheritance for a trio family",
6546                    "available": True,
6547                    "function_name": "calculation_trio",
6548                    "function_params": [],
6549                },
6550                "VAF": {
6551                    "type": "python",
6552                    "name": "VAF",
6553                    "description": "Variant Allele Frequency (VAF) harmonization",
6554                    "available": True,
6555                    "function_name": "calculation_vaf_normalization",
6556                    "function_params": [],
6557                },
6558                "VAF_stats": {
6559                    "type": "python",
6560                    "name": "VAF_stats",
6561                    "description": "Variant Allele Frequency (VAF) statistics",
6562                    "available": True,
6563                    "function_name": "calculation_genotype_stats",
6564                    "function_params": ["VAF"],
6565                },
6566                "DP_stats": {
6567                    "type": "python",
6568                    "name": "DP_stats",
6569                    "description": "Depth (DP) statistics",
6570                    "available": True,
6571                    "function_name": "calculation_genotype_stats",
6572                    "function_params": ["DP"],
6573                },
6574                "variant_id": {
6575                    "type": "python",
6576                    "name": "variant_id",
6577                    "description": "Variant ID generated from variant position and type",
6578                    "available": True,
6579                    "function_name": "calculation_variant_id",
6580                    "function_params": [],
6581                },
6582                "transcripts_json": {
6583                    "type": "python",
6584                    "name": "transcripts_json",
6585                    "description": "Add transcripts annotations in JSON format (field 'transcripts_json')",
6586                    "available": True,
6587                    "function_name": "calculation_transcripts_annotation",
6588                    "function_params": ["transcripts_json", None],
6589                },
6590                "transcripts_ann": {
6591                    "type": "python",
6592                    "name": "transcripts_ann",
6593                    "description": "Add transcripts annotations in structured format (field 'transcripts_ann')",
6594                    "available": True,
6595                    "function_name": "calculation_transcripts_annotation",
6596                    "function_params": [None, "transcripts_ann"],
6597                },
6598                "transcripts_annotations": {
6599                    "type": "python",
6600                    "name": "transcripts_annotations",
6601                    "description": "Add transcripts annotations in JSON and/or structured format (see param JSON file)",
6602                    "available": True,
6603                    "function_name": "calculation_transcripts_annotation",
6604                    "function_params": [None, None],
6605                },
6606                "transcripts_prioritization": {
6607                    "type": "python",
6608                    "name": "transcripts_prioritization",
6609                    "description": "Prioritize transcripts with a prioritization profile (using param.json)",
6610                    "available": True,
6611                    "function_name": "calculation_transcripts_prioritization",
6612                    "function_params": [],
6613                },
6614            },
6615            "prioritizations": {
6616                "default": {
6617                    "filter": [
6618                        {
6619                            "type": "notequals",
6620                            "value": "!PASS|\\.",
6621                            "score": 0,
6622                            "flag": "FILTERED",
6623                            "comment": ["Bad variant quality"],
6624                        },
6625                        {
6626                            "type": "equals",
6627                            "value": "REJECT",
6628                            "score": -20,
6629                            "flag": "PASS",
6630                            "comment": ["Bad variant quality"],
6631                        },
6632                    ],
6633                    "DP": [
6634                        {
6635                            "type": "gte",
6636                            "value": "50",
6637                            "score": 5,
6638                            "flag": "PASS",
6639                            "comment": ["DP higher than 50"],
6640                        }
6641                    ],
6642                    "ANN": [
6643                        {
6644                            "type": "contains",
6645                            "value": "HIGH",
6646                            "score": 5,
6647                            "flag": "PASS",
6648                            "comment": [
6649                                "The variant is assumed to have high (disruptive) impact in the protein, probably causing protein truncation, loss of function or triggering nonsense mediated decay"
6650                            ],
6651                        },
6652                        {
6653                            "type": "contains",
6654                            "value": "MODERATE",
6655                            "score": 3,
6656                            "flag": "PASS",
6657                            "comment": [
6658                                "A non-disruptive variant that might change protein effectiveness"
6659                            ],
6660                        },
6661                        {
6662                            "type": "contains",
6663                            "value": "LOW",
6664                            "score": 0,
6665                            "flag": "FILTERED",
6666                            "comment": [
6667                                "Assumed to be mostly harmless or unlikely to change protein behavior"
6668                            ],
6669                        },
6670                        {
6671                            "type": "contains",
6672                            "value": "MODIFIER",
6673                            "score": 0,
6674                            "flag": "FILTERED",
6675                            "comment": [
6676                                "Usually non-coding variants or variants affecting non-coding genes, where predictions are difficult or there is no evidence of impact"
6677                            ],
6678                        },
6679                    ],
6680                }
6681            },
6682        }
6683
6684        return config_default.get(name, None)

The function get_config_default returns a dictionary containing default configurations for various calculations and prioritizations.

Parameters
  • name: the key identifying which configuration section to retrieve from the defaults dictionary (e.g. "calculations" or "prioritizations")
Returns

The function get_config_default returns a dictionary containing default configuration settings for different calculations and prioritizations. The specific configuration settings are retrieved based on the input name parameter provided to the function. If the name parameter matches a key in the config_default dictionary, the corresponding configuration settings are returned. If there is no match, an empty dictionary is returned.

def get_config_json(self, name: str, config_dict: dict = {}, config_file: str = None) -> dict:
6686    def get_config_json(
6687        self, name: str, config_dict: dict = {}, config_file: str = None
6688    ) -> dict:
6689        """
6690        The function `get_config_json` retrieves a configuration JSON object with prioritizations from
6691        default values, a dictionary, and a file.
6692
6693        :param name: The `name` parameter in the `get_config_json` function is a string that represents
6694        the name of the configuration. It is used to identify and retrieve the configuration settings
6695        for a specific component or module
6696        :type name: str
6697        :param config_dict: The `config_dict` parameter in the `get_config_json` function is a
6698        dictionary that allows you to provide additional configuration settings or overrides. When you
6699        call the `get_config_json` function, you can pass a dictionary containing key-value pairs where
6700        the key is the configuration setting you want to override or
6701        :type config_dict: dict
6702        :param config_file: The `config_file` parameter in the `get_config_json` function is used to
6703        specify the path to a configuration file that contains additional settings. If provided, the
6704        function will read the contents of this file and update the configuration dictionary with the
6705        values found in the file, overriding any existing values with the
6706        :type config_file: str
6707        :return: The function `get_config_json` returns a dictionary containing the configuration
6708        settings.
6709        """
6710
6711        # Create with default prioritizations
6712        config_default = self.get_config_default(name=name)
6713        configuration = config_default
6714        # log.debug(f"configuration={configuration}")
6715
6716        # Replace prioritizations from dict
6717        for config in config_dict:
6718            configuration[config] = config_dict[config]
6719
6720        # Replace prioritizations from file
6721        config_file = full_path(config_file)
6722        if config_file:
6723            if os.path.exists(config_file):
6724                with open(config_file) as config_file_content:
6725                    config_file_dict = json.load(config_file_content)
6726                for config in config_file_dict:
6727                    configuration[config] = config_file_dict[config]
6728            else:
6729                msg_error = f"Config '{name}' file '{config_file}' does NOT exist"
6730                log.error(msg_error)
6731                raise ValueError(msg_error)
6732
6733        return configuration

The function get_config_json retrieves a configuration JSON object with prioritizations from default values, a dictionary, and a file.

Parameters
  • name: The name parameter in the get_config_json function is a string that represents the name of the configuration. It is used to identify and retrieve the configuration settings for a specific component or module
  • config_dict: The config_dict parameter in the get_config_json function is a dictionary that allows you to provide additional configuration settings or overrides. When you call the get_config_json function, you can pass a dictionary containing key-value pairs where the key is the configuration setting you want to override or add
  • config_file: The config_file parameter in the get_config_json function is used to specify the path to a configuration file that contains additional settings. If provided, the function will read the contents of this file and update the configuration dictionary, overriding any existing values with the values found in the file
Returns

The function get_config_json returns a dictionary containing the configuration settings.

def prioritization( self, table: str = None, pz_prefix: str = None, pz_param: dict = None) -> bool:
6735    def prioritization(
6736        self, table: str = None, pz_prefix: str = None, pz_param: dict = None
6737    ) -> bool:
6738        """
6739        The `prioritization` function in Python processes VCF files, adds new INFO fields, and
6740        prioritizes variants based on configured profiles and criteria.
6741
6742        :param table: The `table` parameter in the `prioritization` function is used to specify the name
6743        of the table (presumably a VCF file) on which the prioritization operation will be performed. If
6744        a table name is provided, the method will prioritize the variants in that specific table
6745        :type table: str
6746        :param pz_prefix: The `pz_prefix` parameter is used to specify a prefix that will be added to
6747        certain INFO fields in a VCF file during the prioritization process. If this parameter is not
6748        provided, the code will use a default prefix value of "PZ"
6749        :type pz_prefix: str
6750        :param pz_param: The `pz_param` parameter in the `prioritization` method is used to pass
6751        additional parameters specific to the prioritization process. These parameters can include
6752        settings related to prioritization profiles, fields, scoring modes, flags, comments, and other
6753        configurations needed for the prioritization of variants in a V
6754        :type pz_param: dict
6755        :return: A boolean value (True) is being returned from the `prioritization` function.
6756        """
6757
6758        # Config
6759        config = self.get_config()
6760
6761        # Param
6762        param = self.get_param()
6763
6764        # Prioritization param
6765        if pz_param is not None:
6766            prioritization_param = pz_param
6767        else:
6768            prioritization_param = param.get("prioritization", {})
6769
6770        # Configuration profiles
6771        prioritization_config_file = prioritization_param.get(
6772            "prioritization_config", None
6773        )
6774        prioritization_config_file = full_path(prioritization_config_file)
6775        prioritizations_config = self.get_config_json(
6776            name="prioritizations", config_file=prioritization_config_file
6777        )
6778
6779        # Prioritization prefix
6780        pz_prefix_default = "PZ"
6781        if pz_prefix is None:
6782            pz_prefix = prioritization_param.get("pzprefix", pz_prefix_default)
6783
6784        # Prioritization options
6785        profiles = prioritization_param.get("profiles", [])
6786        if isinstance(profiles, str):
6787            profiles = profiles.split(",")
6788        pzfields = prioritization_param.get(
6789            "pzfields", [f"{pz_prefix}Flag", f"{pz_prefix}Score"]
6790        )
6791        if isinstance(pzfields, str):
6792            pzfields = pzfields.split(",")
6793        default_profile = prioritization_param.get("default_profile", None)
6794        pzfields_sep = prioritization_param.get("pzfields_sep", "_")
6795        prioritization_score_mode = prioritization_param.get(
6796            "prioritization_score_mode", "HOWARD"
6797        )
6798
6799        # Quick Prioritizations
6800        prioritizations = param.get("prioritizations", None)
6801        if prioritizations:
6802            log.info("Quick Prioritization:")
6803            for profile in prioritizations.split(","):
6804                if profile not in profiles:
6805                    profiles.append(profile)
6806                    log.info(f"   {profile}")
6807
6808        # If profile "ALL" provided, all profiles in the config profiles
6809        if "ALL" in profiles:
6810            profiles = list(prioritizations_config.keys())
6811
6812        for profile in profiles:
6813            if prioritizations_config.get(profile, None):
6814                log.debug(f"Profile '{profile}' configured")
6815            else:
6816                msg_error = f"Profile '{profile}' NOT configured"
6817                log.error(msg_error)
6818                raise ValueError(msg_error)
6819
6820        if profiles:
6821            log.info(f"Prioritization... ")
6822        else:
6823            log.debug(f"No profile defined")
6824            return False
6825
6826        if not default_profile and len(profiles):
6827            default_profile = profiles[0]
6828
6829        log.debug("Profiles availables: " + str(list(prioritizations_config.keys())))
6830        log.debug("Profiles to check: " + str(list(profiles)))
6831
6832        # Variables
6833        if table is not None:
6834            table_variants = table
6835        else:
6836            table_variants = self.get_table_variants(clause="update")
6837        log.debug(f"Table to prioritize: {table_variants}")
6838
6839        # Added columns
6840        added_columns = []
6841
6842        # Create list of PZfields
6843        # List of PZFields
6844        list_of_pzfields_original = pzfields + [
6845            pzfield + pzfields_sep + profile
6846            for pzfield in pzfields
6847            for profile in profiles
6848        ]
6849        list_of_pzfields = []
6850        log.debug(f"{list_of_pzfields_original}")
6851
6852        # Remove existing PZfields to use if exists
6853        for pzfield in list_of_pzfields_original:
6854            if self.get_header().infos.get(pzfield, None) is None:
6855                list_of_pzfields.append(pzfield)
6856                log.debug(f"VCF Input - Header - PZfield '{pzfield}' not in VCF")
6857            else:
6858                log.debug(f"VCF Input - Header - PZfield '{pzfield}' already in VCF")
6859
6860        if list_of_pzfields:
6861
6862            # Explode Infos prefix
6863            explode_infos_prefix = self.get_explode_infos_prefix()
6864
6865            # PZfields tags description
6866            PZfields_INFOS = {
6867                f"{pz_prefix}Tags": {
6868                    "ID": f"{pz_prefix}Tags",
6869                    "Number": ".",
6870                    "Type": "String",
6871                    "Description": "Variant tags based on annotation criteria",
6872                },
6873                f"{pz_prefix}Score": {
6874                    "ID": f"{pz_prefix}Score",
6875                    "Number": 1,
6876                    "Type": "Integer",
6877                    "Description": "Variant score based on annotation criteria",
6878                },
6879                f"{pz_prefix}Flag": {
6880                    "ID": f"{pz_prefix}Flag",
6881                    "Number": 1,
6882                    "Type": "String",
6883                    "Description": "Variant flag based on annotation criteria",
6884                },
6885                f"{pz_prefix}Comment": {
6886                    "ID": f"{pz_prefix}Comment",
6887                    "Number": ".",
6888                    "Type": "String",
6889                    "Description": "Variant comment based on annotation criteria",
6890                },
6891                f"{pz_prefix}Infos": {
6892                    "ID": f"{pz_prefix}Infos",
6893                    "Number": ".",
6894                    "Type": "String",
6895                    "Description": "Variant infos based on annotation criteria",
6896                },
6897            }
6898
6899            # Create INFO fields if not exist
6900            for field in PZfields_INFOS:
6901                field_ID = PZfields_INFOS[field]["ID"]
6902                field_description = PZfields_INFOS[field]["Description"]
6903                if field_ID not in self.get_header().infos and field_ID in pzfields:
6904                    field_description = (
6905                        PZfields_INFOS[field]["Description"]
6906                        + f", profile {default_profile}"
6907                    )
6908                    self.get_header().infos[field_ID] = vcf.parser._Info(
6909                        field_ID,
6910                        PZfields_INFOS[field]["Number"],
6911                        PZfields_INFOS[field]["Type"],
6912                        field_description,
6913                        "unknown",
6914                        "unknown",
6915                        code_type_map[PZfields_INFOS[field]["Type"]],
6916                    )
6917
6918            # Create INFO fields if not exist for each profile
6919            for profile in prioritizations_config:
6920                if profile in profiles or profiles == []:
6921                    for field in PZfields_INFOS:
6922                        field_ID = PZfields_INFOS[field]["ID"] + pzfields_sep + profile
6923                        field_description = (
6924                            PZfields_INFOS[field]["Description"]
6925                            + f", profile {profile}"
6926                        )
6927                        if (
6928                            field_ID not in self.get_header().infos
6929                            and field in pzfields
6930                        ):
6931                            self.get_header().infos[field_ID] = vcf.parser._Info(
6932                                field_ID,
6933                                PZfields_INFOS[field]["Number"],
6934                                PZfields_INFOS[field]["Type"],
6935                                field_description,
6936                                "unknown",
6937                                "unknown",
6938                                code_type_map[PZfields_INFOS[field]["Type"]],
6939                            )
6940
6941            # Header
6942            for pzfield in list_of_pzfields:
6943                if re.match(f"{pz_prefix}Score.*", pzfield):
6944                    added_column = self.add_column(
6945                        table_name=table_variants,
6946                        column_name=pzfield,
6947                        column_type="INTEGER",
6948                        default_value="0",
6949                    )
6950                elif re.match(f"{pz_prefix}Flag.*", pzfield):
6951                    added_column = self.add_column(
6952                        table_name=table_variants,
6953                        column_name=pzfield,
6954                        column_type="BOOLEAN",
6955                        default_value="1",
6956                    )
6957                else:
6958                    added_column = self.add_column(
6959                        table_name=table_variants,
6960                        column_name=pzfield,
6961                        column_type="STRING",
6962                        default_value="''",
6963                    )
6964                added_columns.append(added_column)
6965
6966            # Profiles
6967            if profiles:
6968
6969                # foreach profile in configuration file
6970                for profile in prioritizations_config:
6971
6972                    # If profile is asked in param, or ALL are asked (empty profile [])
6973                    if profile in profiles or profiles == []:
6974                        log.info(f"Profile '{profile}'")
6975
6976                        sql_set_info_option = ""
6977
6978                        sql_set_info = []
6979
6980                        # PZ fields set
6981
6982                        # PZScore
6983                        if (
6984                            f"{pz_prefix}Score{pzfields_sep}{profile}"
6985                            in list_of_pzfields
6986                        ):
6987                            sql_set_info.append(
6988                                f"""
6989                                    concat(
6990                                        '{pz_prefix}Score{pzfields_sep}{profile}=',
6991                                        {pz_prefix}Score{pzfields_sep}{profile}
6992                                    ) 
6993                                """
6994                            )
6995                            if (
6996                                profile == default_profile
6997                                and f"{pz_prefix}Score" in list_of_pzfields
6998                            ):
6999                                sql_set_info.append(
7000                                    f"""
7001                                        concat(
7002                                            '{pz_prefix}Score=',
7003                                            {pz_prefix}Score{pzfields_sep}{profile}
7004                                        )
7005                                    """
7006                                )
7007
7008                        # PZFlag
7009                        if (
7010                            f"{pz_prefix}Flag{pzfields_sep}{profile}"
7011                            in list_of_pzfields
7012                        ):
7013                            sql_set_info.append(
7014                                f"""
7015                                    concat(
7016                                        '{pz_prefix}Flag{pzfields_sep}{profile}=',
7017                                        CASE 
7018                                            WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1
7019                                            THEN 'PASS'
7020                                            WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0
7021                                            THEN 'FILTERED'
7022                                        END
7023                                    ) 
7024                                """
7025                            )
7026                            if (
7027                                profile == default_profile
7028                                and f"{pz_prefix}Flag" in list_of_pzfields
7029                            ):
7030                                sql_set_info.append(
7031                                    f"""
7032                                        concat(
7033                                            '{pz_prefix}Flag=',
7034                                            CASE 
7035                                                WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1
7036                                                THEN 'PASS'
7037                                                WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0
7038                                                THEN 'FILTERED'
7039                                            END
7040                                        )
7041                                    """
7042                                )
7043
7044                        # PZComment
7045                        if (
7046                            f"{pz_prefix}Comment{pzfields_sep}{profile}"
7047                            in list_of_pzfields
7048                        ):
7049                            sql_set_info.append(
7050                                f"""
7051                                    CASE
7052                                        WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('')
7053                                        THEN concat('{pz_prefix}Comment{pzfields_sep}{profile}=', {pz_prefix}Comment{pzfields_sep}{profile})
7054                                        ELSE ''
7055                                    END
7056                                """
7057                            )
7058                            if (
7059                                profile == default_profile
7060                                and f"{pz_prefix}Comment" in list_of_pzfields
7061                            ):
7062                                sql_set_info.append(
7063                                    f"""
7064                                        CASE
7065                                            WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('')
7066                                            THEN concat('{pz_prefix}Comment=', {pz_prefix}Comment{pzfields_sep}{profile})
7067                                            ELSE ''
7068                                        END
7069                                    """
7070                                )
7071
7072                        # PZInfos
7073                        if (
7074                            f"{pz_prefix}Infos{pzfields_sep}{profile}"
7075                            in list_of_pzfields
7076                        ):
7077                            sql_set_info.append(
7078                                f"""
7079                                    CASE
7080                                        WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('')
7081                                        THEN concat('{pz_prefix}Infos{pzfields_sep}{profile}=', {pz_prefix}Infos{pzfields_sep}{profile})
7082                                        ELSE ''
7083                                    END
7084                                """
7085                            )
7086                            if (
7087                                profile == default_profile
7088                                and f"{pz_prefix}Infos" in list_of_pzfields
7089                            ):
7090                                sql_set_info.append(
7091                                    f"""
7092                                        CASE
7093                                            WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('')
7094                                            THEN concat('{pz_prefix}Infos=', {pz_prefix}Infos{pzfields_sep}{profile})
7095                                            ELSE ''
7096                                        END
7097                                    """
7098                                )
7099
7100                        # Merge PZfields
7101                        sql_set_info_option = ""
7102                        sql_set_sep = ""
7103                        for sql_set in sql_set_info:
7104                            if sql_set_sep:
7105                                sql_set_info_option += f"""
7106                                    , concat('{sql_set_sep}', {sql_set})
7107                                """
7108                            else:
7109                                sql_set_info_option += f"""
7110                                    , {sql_set}
7111                                """
7112                            sql_set_sep = ";"
7113
7114                        sql_queries = []
7115                        for annotation in prioritizations_config[profile]:
7116
7117                            # Explode specific annotation
7118                            log.debug(f"Explode annotation '{annotation}'")
7119                            added_columns += self.explode_infos(
7120                                prefix=explode_infos_prefix,
7121                                fields=[annotation],
7122                                table=table_variants,
7123                            )
7124                            extra_infos = self.get_extra_infos(table=table_variants)
7125
7126                            # Check if annotation field is present
7127                            if not f"{explode_infos_prefix}{annotation}" in extra_infos:
7128                                log.debug(f"Annotation '{annotation}' not in data")
7129                                continue
7130                            else:
7131                                log.debug(f"Annotation '{annotation}' in data")
7132
7133                            # For each criterions
7134                            for criterion in prioritizations_config[profile][
7135                                annotation
7136                            ]:
7137                                criterion_type = criterion["type"]
7138                                criterion_value = criterion["value"]
7139                                criterion_score = criterion.get("score", 0)
7140                                criterion_flag = criterion.get("flag", "PASS")
7141                                criterion_flag_bool = criterion_flag == "PASS"
7142                                criterion_comment = (
7143                                    ", ".join(criterion.get("comment", []))
7144                                    .replace("'", "''")
7145                                    .replace(";", ",")
7146                                    .replace("\t", " ")
7147                                )
7148                                criterion_infos = (
7149                                    str(criterion)
7150                                    .replace("'", "''")
7151                                    .replace(";", ",")
7152                                    .replace("\t", " ")
7153                                )
7154
7155                                sql_set = []
7156                                sql_set_info = []
7157
7158                                # PZ fields set
7159                                if (
7160                                    f"{pz_prefix}Score{pzfields_sep}{profile}"
7161                                    in list_of_pzfields
7162                                ):
7163                                    if prioritization_score_mode == "HOWARD":
7164                                        sql_set.append(
7165                                            f"{pz_prefix}Score{pzfields_sep}{profile} = {pz_prefix}Score{pzfields_sep}{profile} + {criterion_score}"
7166                                        )
7167                                    elif prioritization_score_mode == "VaRank":
7168                                        sql_set.append(
7169                                            f"{pz_prefix}Score{pzfields_sep}{profile} = CASE WHEN {criterion_score}>{pz_prefix}Score{pzfields_sep}{profile} THEN {criterion_score} END"
7170                                        )
7171                                    else:
7172                                        sql_set.append(
7173                                            f"{pz_prefix}Score{pzfields_sep}{profile} = {pz_prefix}Score{pzfields_sep}{profile} + {criterion_score}"
7174                                        )
7175                                if (
7176                                    f"{pz_prefix}Flag{pzfields_sep}{profile}"
7177                                    in list_of_pzfields
7178                                ):
7179                                    sql_set.append(
7180                                        f"{pz_prefix}Flag{pzfields_sep}{profile} = {pz_prefix}Flag{pzfields_sep}{profile} AND {criterion_flag_bool}"
7181                                    )
7182                                if (
7183                                    f"{pz_prefix}Comment{pzfields_sep}{profile}"
7184                                    in list_of_pzfields
7185                                ):
7186                                    sql_set.append(
7187                                        f"""
7188                                            {pz_prefix}Comment{pzfields_sep}{profile} = 
7189                                                concat(
7190                                                    {pz_prefix}Comment{pzfields_sep}{profile},
7191                                                    CASE 
7192                                                        WHEN {pz_prefix}Comment{pzfields_sep}{profile}!=''
7193                                                        THEN ', '
7194                                                        ELSE ''
7195                                                    END,
7196                                                    '{criterion_comment}'
7197                                                )
7198                                        """
7199                                    )
7200                                if (
7201                                    f"{pz_prefix}Infos{pzfields_sep}{profile}"
7202                                    in list_of_pzfields
7203                                ):
7204                                    sql_set.append(
7205                                        f"""
7206                                            {pz_prefix}Infos{pzfields_sep}{profile} = 
7207                                                concat(
7208                                                    {pz_prefix}Infos{pzfields_sep}{profile},
7209                                                    '{criterion_infos}'
7210                                                )
7211                                        """
7212                                    )
7213                                sql_set_option = ",".join(sql_set)
7214
7215                                # Criterion and comparison
7216                                if sql_set_option:
7217                                    try:
7218                                        float(criterion_value)
7219                                        sql_update = f"""
7220                                            UPDATE {table_variants}
7221                                            SET {sql_set_option}
7222                                            WHERE CAST("{explode_infos_prefix}{annotation}" AS VARCHAR) NOT IN ('','.')
7223                                            AND CAST("{explode_infos_prefix}{annotation}" AS FLOAT){comparison_map[criterion_type]}{criterion_value}
7224                                            """
7225                                    except:
7226                                        contains_option = ""
7227                                        if criterion_type == "contains":
7228                                            contains_option = ".*"
7229                                        sql_update = f"""
7230                                            UPDATE {table_variants}
7231                                            SET {sql_set_option}
7232                                            WHERE "{explode_infos_prefix}{annotation}" SIMILAR TO '{contains_option}{criterion_value}{contains_option}'
7233                                            """
7234                                    sql_queries.append(sql_update)
7235                                else:
7236                                    log.warning(
7237                                        f"NO SQL SET option for '{annotation}' - '{criterion}'"
7238                                    )
7239
7240                        # PZTags
7241                        if (
7242                            f"{pz_prefix}Tags{pzfields_sep}{profile}"
7243                            in list_of_pzfields
7244                        ):
7245
7246                            # Create PZFalgs value
7247                            pztags_value = ""
7248                            pztags_sep_default = "|"
7249                            pztags_sep = ""
7250                            for pzfield in pzfields:
7251                                if pzfield not in [f"{pz_prefix}Tags"]:
7252                                    if (
7253                                        f"{pzfield}{pzfields_sep}{profile}"
7254                                        in list_of_pzfields
7255                                    ):
7256                                        if pzfield in [f"{pz_prefix}Flag"]:
7257                                            pztags_value += f"""{pztags_sep}{pzfield}#', 
7258                                                CASE WHEN {pz_prefix}Flag{pzfields_sep}{profile}
7259                                                    THEN 'PASS'
7260                                                    ELSE 'FILTERED'
7261                                                END, '"""
7262                                        else:
7263                                            pztags_value += f"{pztags_sep}{pzfield}#', {pzfield}{pzfields_sep}{profile}, '"
7264                                        pztags_sep = pztags_sep_default
7265
7266                            # Add Query update for PZFlags
7267                            sql_update_pztags = f"""
7268                                UPDATE {table_variants}
7269                                SET INFO = concat(
7270                                        INFO,
7271                                        CASE WHEN INFO NOT in ('','.')
7272                                                THEN ';'
7273                                                ELSE ''
7274                                        END,
7275                                        '{pz_prefix}Tags{pzfields_sep}{profile}={pztags_value}'
7276                                    )
7277                                """
7278                            sql_queries.append(sql_update_pztags)
7279
7280                            # Add Query update for PZFlags for default
7281                            if profile == default_profile:
7282                                sql_update_pztags_default = f"""
7283                                UPDATE {table_variants}
7284                                SET INFO = concat(
7285                                        INFO,
7286                                        ';',
7287                                        '{pz_prefix}Tags={pztags_value}'
7288                                    )
7289                                """
7290                                sql_queries.append(sql_update_pztags_default)
7291
7292                        log.info(f"""Profile '{profile}' - Prioritization... """)
7293
7294                        if sql_queries:
7295
7296                            for sql_query in sql_queries:
7297                                log.debug(
7298                                    f"""Profile '{profile}' - Prioritization query: {sql_query}... """
7299                                )
7300                                self.conn.execute(sql_query)
7301
7302                        log.info(f"""Profile '{profile}' - Update... """)
7303                        sql_query_update = f"""
7304                            UPDATE {table_variants}
7305                            SET INFO =  
7306                                concat(
7307                                    CASE
7308                                        WHEN INFO NOT IN ('','.')
7309                                        THEN concat(INFO, ';')
7310                                        ELSE ''
7311                                    END
7312                                    {sql_set_info_option}
7313                                )
7314                        """
7315                        self.conn.execute(sql_query_update)
7316
7317        else:
7318
7319            log.warning(f"No profiles in parameters")
7320
7321        # Remove added columns
7322        for added_column in added_columns:
7323            self.drop_column(column=added_column)
7324
7325        # Explode INFOS fields into table fields
7326        if self.get_explode_infos():
7327            self.explode_infos(
7328                prefix=self.get_explode_infos_prefix(),
7329                fields=self.get_explode_infos_fields(),
7330                force=True,
7331            )
7332
7333        return True

The prioritization function in Python processes VCF files, adds new INFO fields, and prioritizes variants based on configured profiles and criteria.

Parameters
  • table: The table parameter in the prioritization function is used to specify the name of the table (presumably a VCF file) on which the prioritization operation will be performed. If a table name is provided, the method will prioritize the variants in that specific table
  • pz_prefix: The pz_prefix parameter is used to specify a prefix that will be added to certain INFO fields in a VCF file during the prioritization process. If this parameter is not provided, the code will use a default prefix value of "PZ"
  • pz_param: The pz_param parameter in the prioritization method is used to pass additional parameters specific to the prioritization process. These parameters can include settings related to prioritization profiles, fields, scoring modes, flags, comments, and other configurations needed for the prioritization of variants in a VCF file
Returns

The prioritization function returns the boolean value True upon completion.

def annotation_hgvs(self, threads: int = None) -> None:
    def annotation_hgvs(self, threads: int = None) -> None:
        """
        The `annotation_hgvs` function performs HGVS annotation on a set of variants using genomic
        coordinates and alleles.

        Only SNV/InDel variants (REF and ALT matching '^[A-Za-z]+$') are selected for annotation.
        Each variant row is annotated in parallel through a Dask DataFrame partitioned by thread
        count; the resulting HGVS strings are written into a temporary table column, merged into
        the INFO field as an 'hgvs=' entry, and the 'hgvs' field is declared in the VCF header.

        :param threads: The `threads` parameter is an optional integer that specifies the number of
        threads to use for parallel processing. If no value is provided, it will default to the number
        of threads obtained from the `get_threads()` method
        :type threads: int
        """

        # Function for each partition of the Dask Dataframe
        def partition_function(partition):
            """
            The function `partition_function` applies the `annotation_hgvs_partition` function to
            each row of a DataFrame called `partition`.

            :param partition: The parameter "partition" is a pandas DataFrame that contains the data
            to be processed
            :return: the result of applying the "annotation_hgvs_partition" function to each row of
            the "partition" dataframe along the axis 1.
            """
            return partition.apply(annotation_hgvs_partition, axis=1)

        def annotation_hgvs_partition(row) -> str:
            """
            The function `annotation_hgvs_partition` takes in a row of data and returns a string
            containing a comma-separated list of HGVS names associated with the given genomic
            coordinates and alleles.

            :param row: A dictionary-like object that contains values for the keys
            "CHROM", "POS", "REF" and "ALT"
            :return: a string that contains the HGVS names associated with the given row of data.
            """

            # NOTE: 'chr' shadows the builtin of the same name; kept as-is
            chr = row["CHROM"]
            pos = row["POS"]
            ref = row["REF"]
            alt = row["ALT"]

            # Find list of associated transcripts
            # ('refseq_df' is resolved by the Polars SQLContext from the enclosing scope,
            # via register_globals=True)
            transcripts_list = list(
                polars_conn.execute(
                    f"""
                SELECT transcript
                FROM refseq_df
                WHERE CHROM='{chr}'
                AND POS={pos}
            """
                )["transcript"]
            )

            # Full HGVS annotation in list
            hgvs_full_list = []

            for transcript_name in transcripts_list:

                # Transcript model loaded earlier from the refSeq TSV export
                transcript = get_transcript(
                    transcripts=transcripts, transcript_name=transcript_name
                )
                # Exon number, only when requested in parameters
                if use_exon:
                    exon = transcript.find_exon_number(pos)
                else:
                    exon = None
                # Protein accession lookup, only when a protein-aware format is requested
                # NOTE(review): 'refseqlink_df' is only defined when refseqlink_file was found;
                # if use_protein/add_protein/full_format is set without a refSeqLink database,
                # this query would raise — confirm upstream guarantees
                transcript_protein = None
                if use_protein or add_protein or full_format:
                    transcripts_protein = list(
                        polars_conn.execute(
                            f"""
                        SELECT protein
                        FROM refseqlink_df
                        WHERE transcript='{transcript_name}'
                        LIMIT 1
                    """
                        )["protein"]
                    )
                    if len(transcripts_protein):
                        transcript_protein = transcripts_protein[0]

                # HGVS name in the primary requested format
                hgvs_name = format_hgvs_name(
                    chr,
                    pos,
                    ref,
                    alt,
                    genome=genome,
                    transcript=transcript,
                    transcript_protein=transcript_protein,
                    exon=exon,
                    use_gene=use_gene,
                    use_protein=use_protein,
                    full_format=full_format,
                    use_version=use_version,
                    codon_type=codon_type,
                )
                hgvs_full_list.append(hgvs_name)
                # Optionally add a second, protein-level HGVS name for the same transcript
                if add_protein and not use_protein and not full_format:
                    hgvs_name = format_hgvs_name(
                        chr,
                        pos,
                        ref,
                        alt,
                        genome=genome,
                        transcript=transcript,
                        transcript_protein=transcript_protein,
                        exon=exon,
                        use_gene=use_gene,
                        use_protein=True,
                        full_format=False,
                        use_version=use_version,
                        codon_type=codon_type,
                    )
                    hgvs_full_list.append(hgvs_name)

            # Create list of HGVS annotations (comma-separated, VCF INFO-friendly)
            hgvs_full = ",".join(hgvs_full_list)

            return hgvs_full

        # Polars connexion (eager SQL over globals; re-created below after the
        # DataFrames it must see are defined)
        polars_conn = pl.SQLContext(register_globals=True, eager=True)

        # Config
        config = self.get_config()

        # Databases
        # Genome folder (with default fallback)
        databases_genomes_folders = (
            config.get("folders", {})
            .get("databases", {})
            .get("genomes", DEFAULT_GENOME_FOLDER)
        )
        # Same config key, but defaulting to "" — used for the direct-file lookup below
        databases_genome = (
            config.get("folders", {}).get("databases", {}).get("genomes", "")
        )
        # refseq database folder
        databases_refseq_folders = (
            config.get("folders", {})
            .get("databases", {})
            .get("refseq", DEFAULT_REFSEQ_FOLDER)
        )
        # refseq
        databases_refseq = config.get("databases", {}).get("refSeq", None)
        # refSeqLink
        databases_refseqlink = config.get("databases", {}).get("refSeqLink", None)

        # Param
        param = self.get_param()

        # Quick HGVS: parse 'hgvs_options' ("opt1=val,opt2,...") into param["hgvs"];
        # a bare option name means True, "TRUE"/"FALSE" are coerced to booleans
        if "hgvs_options" in param and param.get("hgvs_options", ""):
            log.info(f"Quick HGVS Annotation:")
            if not param.get("hgvs", None):
                param["hgvs"] = {}
            for option in param.get("hgvs_options", "").split(","):
                option_var_val = option.split("=")
                option_var = option_var_val[0]
                if len(option_var_val) > 1:
                    option_val = option_var_val[1]
                else:
                    option_val = "True"
                if option_val.upper() in ["TRUE"]:
                    option_val = True
                elif option_val.upper() in ["FALSE"]:
                    option_val = False
                log.info(f"   {option_var}={option_val}")
                param["hgvs"][option_var] = option_val

        # Check if HGVS annotation enabled; return early (no-op) otherwise
        if "hgvs" in param:
            log.info(f"HGVS Annotation... ")
            for hgvs_option in param.get("hgvs", {}):
                log.info(f"{hgvs_option}: {param.get('hgvs',{}).get(hgvs_option)}")
        else:
            return

        # HGVS Param
        param_hgvs = param.get("hgvs", {})
        use_exon = param_hgvs.get("use_exon", False)
        use_gene = param_hgvs.get("use_gene", False)
        use_protein = param_hgvs.get("use_protein", False)
        add_protein = param_hgvs.get("add_protein", False)
        full_format = param_hgvs.get("full_format", False)
        use_version = param_hgvs.get("use_version", False)
        # NOTE: codon_type default is the string "3" (three-letter codes), not an int
        codon_type = param_hgvs.get("codon_type", "3")

        # refSeq / refSeqLink: param values override config-level values
        databases_refseq = param_hgvs.get("refseq", databases_refseq)
        databases_refseqlink = param_hgvs.get("refseqlink", databases_refseqlink)

        # Assembly
        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))

        # Genome: try the directly-configured genome first, then the folder+assembly lookup
        genome_file = None
        if find_genome(databases_genome):
            genome_file = find_genome(databases_genome)
        else:
            genome_file = find_genome(
                genome_path=databases_genomes_folders, assembly=assembly
            )
        log.debug("Genome: " + str(genome_file))

        # refSeq annotation file
        refseq_file = find_file_prefix(
            input_file=databases_refseq,
            prefix="ncbiRefSeq",
            folder=databases_refseq_folders,
            assembly=assembly,
        )
        log.debug("refSeq: " + str(refseq_file))

        # refSeqLink (transcript -> protein accession mapping); may be None
        refseqlink_file = find_file_prefix(
            input_file=databases_refseqlink,
            prefix="ncbiRefSeqLink",
            folder=databases_refseq_folders,
            assembly=assembly,
        )
        log.debug("refSeqLink: " + str(refseqlink_file))

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Variables
        table_variants = self.get_table_variants(clause="update")

        # Get variants SNV and InDel only (purely alphabetic REF and ALT)
        query_variants = f"""
            SELECT "#CHROM" AS CHROM, POS, REF, ALT
            FROM {table_variants}
            WHERE REF ~ '^[A-Za-z]+$' AND ALT ~ '^[A-Za-z]+$'
            """
        df_variants = self.get_query_to_df(query_variants)

        # Added columns (tracked so they can be dropped at the end)
        added_columns = []

        # Add hgvs column in variants table (randomized name to avoid collisions)
        hgvs_column_name = "hgvs_" + str(random.randrange(1000))
        added_column = self.add_column(
            table_variants, hgvs_column_name, "STRING", default_value=None
        )
        added_columns.append(added_column)

        log.debug(f"refSeq loading...")
        # refSeq in duckDB
        refseq_table = get_refseq_table(
            conn=self.conn, refseq_table="refseq", refseq_file=refseq_file
        )
        # Loading all refSeq rows overlapping a variant position into a Polars Dataframe
        refseq_query = f"""
            SELECT df_variants.CHROM, df_variants.POS, {refseq_table}.name AS transcript
            FROM {refseq_table}
            JOIN df_variants ON (
                {refseq_table}.chrom = df_variants.CHROM
                AND {refseq_table}.txStart<=df_variants.POS
                AND {refseq_table}.txEnd>=df_variants.POS
            )
        """
        refseq_df = self.conn.query(refseq_query).pl()

        if refseqlink_file:
            log.debug(f"refSeqLink loading...")
            # refSeqLink in duckDB
            refseqlink_table = get_refseq_table(
                conn=self.conn, refseq_table="refseqlink", refseq_file=refseqlink_file
            )
            # Loading all refSeqLink in Dataframe (versioned protein/transcript accessions)
            protacc_column = "protAcc_with_ver"
            mrnaacc_column = "mrnaAcc_with_ver"
            refseqlink_query = f"""
                SELECT {refseq_table}.chrom, {protacc_column} AS protein, {mrnaacc_column} AS transcript
                FROM {refseqlink_table} 
                JOIN {refseq_table} ON ({refseq_table}.name = {refseqlink_table}.mrnaAcc_with_ver)
                WHERE protAcc_without_ver IS NOT NULL
            """
            # Polars Dataframe (queried by name inside annotation_hgvs_partition)
            refseqlink_df = self.conn.query(f"{refseqlink_query}").pl()

        # Read RefSeq transcripts into a python dict/model,
        # via a temporary TSV export from duckDB
        log.debug(f"Transcripts loading...")
        with tempfile.TemporaryDirectory() as tmpdir:
            transcripts_query = f"""
                COPY (
                    SELECT {refseq_table}.*
                    FROM {refseq_table}
                    JOIN df_variants ON (
                        {refseq_table}.chrom=df_variants.CHROM
                        AND {refseq_table}.txStart<=df_variants.POS
                        AND {refseq_table}.txEnd>=df_variants.POS
                    )
                )
                TO '{tmpdir}/transcript.tsv' (DELIMITER '\t');
            """
            self.conn.query(transcripts_query)
            with open(f"{tmpdir}/transcript.tsv") as infile:
                transcripts = read_transcripts(infile)

        # Polars connexion, re-created so register_globals picks up
        # refseq_df / refseqlink_df defined above
        polars_conn = pl.SQLContext(register_globals=True, eager=True)

        log.debug("Genome loading...")
        # Read genome sequence using pyfaidx.
        genome = Fasta(genome_file)

        log.debug("Start annotation HGVS...")

        # Create a Dask Dataframe from Pandas dataframe with partition as number of threads
        ddf = dd.from_pandas(df_variants, npartitions=threads)

        # Use dask.dataframe.apply() to apply function on each partition
        ddf[hgvs_column_name] = ddf.map_partitions(partition_function)

        # Convert Dask DataFrame to Pandas Dataframe (triggers the parallel computation)
        df = ddf.compute()

        # Convert Pandas dataframe to parquet (due to error in cast VARCHAR -> NULL ???)
        with tempfile.TemporaryDirectory() as tmpdir:
            df_parquet = os.path.join(tmpdir, "df.parquet")
            df.to_parquet(df_parquet)

            # Update hgvs column, matching variants on CHROM/POS/REF/ALT
            update_variant_query = f"""
                UPDATE {table_variants}
                SET "{hgvs_column_name}"=df."{hgvs_column_name}"
                FROM read_parquet('{df_parquet}') as df
                WHERE variants."#CHROM" = df.CHROM
                AND variants.POS = df.POS
                AND variants.REF = df.REF
                AND variants.ALT = df.ALT
                AND df."{hgvs_column_name}" NOT IN ('') AND df."{hgvs_column_name}" NOT NULL
                """
            self.execute_query(update_variant_query)

        # Update INFO column: append 'hgvs=<value>' (with ';' separator when INFO is non-empty)
        sql_query_update = f"""
            UPDATE {table_variants}
            SET INFO = 
                concat(
                    CASE 
                        WHEN INFO NOT IN ('','.')
                        THEN concat(INFO, ';')
                        ELSE ''
                    END,
                    'hgvs=',
                    {hgvs_column_name}
                )
            WHERE "{hgvs_column_name}" NOT IN ('') AND "{hgvs_column_name}" NOT NULL
            """
        self.execute_query(sql_query_update)

        # Add header declaration for the new 'hgvs' INFO field
        # NOTE(review): "annotatation" is a typo in the emitted header Description;
        # fixing it changes output VCF headers — confirm before correcting
        HGVS_INFOS = {
            "hgvs": {
                "ID": "hgvs",
                "Number": ".",
                "Type": "String",
                "Description": f"HGVS annotatation with HOWARD",
            }
        }

        for field in HGVS_INFOS:
            field_ID = HGVS_INFOS[field]["ID"]
            field_description = HGVS_INFOS[field]["Description"]
            self.get_header().infos[field_ID] = vcf.parser._Info(
                field_ID,
                HGVS_INFOS[field]["Number"],
                HGVS_INFOS[field]["Type"],
                field_description,
                "unknown",
                "unknown",
                code_type_map[HGVS_INFOS[field]["Type"]],
            )

        # Remove added columns (temporary hgvs working column)
        for added_column in added_columns:
            self.drop_column(column=added_column)

The annotation_hgvs function performs HGVS annotation on a set of variants using genomic coordinates and alleles.

Parameters
  • threads: The threads parameter is an optional integer that specifies the number of threads to use for parallel processing. If no value is provided, it will default to the number of threads obtained from the get_threads() method
def get_operations_help( self, operations_config_dict: dict = {}, operations_config_file: str = None) -> list:
7726    def get_operations_help(
7727        self, operations_config_dict: dict = {}, operations_config_file: str = None
7728    ) -> list:
7729
7730        # Init
7731        operations_help = []
7732
7733        # operations
7734        operations = self.get_config_json(
7735            name="calculations",
7736            config_dict=operations_config_dict,
7737            config_file=operations_config_file,
7738        )
7739        for op in operations:
7740            op_name = operations[op].get("name", op).upper()
7741            op_description = operations[op].get("description", op_name)
7742            op_available = operations[op].get("available", False)
7743            if op_available:
7744                operations_help.append(f"   {op_name}: {op_description}")
7745
7746        # Sort operations
7747        operations_help.sort()
7748
7749        # insert header
7750        operations_help.insert(0, "Available calculation operations:")
7751
7752        # Return
7753        return operations_help
def calculation( self, operations: dict = {}, operations_config_dict: dict = {}, operations_config_file: str = None) -> None:
7755    def calculation(
7756        self,
7757        operations: dict = {},
7758        operations_config_dict: dict = {},
7759        operations_config_file: str = None,
7760    ) -> None:
7761        """
7762        It takes a list of operations, and for each operation, it checks if it's a python or sql
7763        operation, and then calls the appropriate function
7764
7765        param json example:
7766            "calculation": {
7767                "NOMEN": {
7768                    "options": {
7769                        "hgvs_field": "hgvs"
7770                    },
7771                "middle" : null
7772            }
7773        """
7774
7775        # Param
7776        param = self.get_param()
7777
7778        # operations config
7779        operations_config = self.get_config_json(
7780            name="calculations",
7781            config_dict=operations_config_dict,
7782            config_file=operations_config_file,
7783        )
7784
7785        # Upper keys
7786        operations_config = {k.upper(): v for k, v in operations_config.items()}
7787
7788        # Calculations
7789
7790        # Operations from param
7791        operations = param.get("calculation", {}).get("calculations", operations)
7792
7793        # Quick calculation - add
7794        if param.get("calculations", None):
7795            calculations_list = [
7796                value for value in param.get("calculations", "").split(",")
7797            ]
7798            log.info(f"Quick Calculations:")
7799            for calculation_key in calculations_list:
7800                log.info(f"   {calculation_key}")
7801            for calculation_operation in calculations_list:
7802                if calculation_operation.upper() not in operations:
7803                    operations[calculation_operation.upper()] = {}
7804                    add_value_into_dict(
7805                        dict_tree=param,
7806                        sections=[
7807                            "calculation",
7808                            "calculations",
7809                            calculation_operation.upper(),
7810                        ],
7811                        value={},
7812                    )
7813
7814        # Operations for calculation
7815        if not operations:
7816            operations = param.get("calculation", {}).get("calculations", {})
7817
7818        if operations:
7819            log.info(f"Calculations...")
7820
7821        # For each operations
7822        for operation_name in operations:
7823            operation_name = operation_name.upper()
7824            if operation_name not in [""]:
7825                if operation_name in operations_config:
7826                    log.info(f"Calculation '{operation_name}'")
7827                    operation = operations_config[operation_name]
7828                    operation_type = operation.get("type", "sql")
7829                    if operation_type == "python":
7830                        self.calculation_process_function(
7831                            operation=operation, operation_name=operation_name
7832                        )
7833                    elif operation_type == "sql":
7834                        self.calculation_process_sql(
7835                            operation=operation, operation_name=operation_name
7836                        )
7837                    else:
7838                        log.error(
7839                            f"Operations config: Type '{operation_type}' NOT available"
7840                        )
7841                        raise ValueError(
7842                            f"Operations config: Type '{operation_type}' NOT available"
7843                        )
7844                else:
7845                    log.error(
7846                        f"Operations config: Calculation '{operation_name}' NOT available"
7847                    )
7848                    raise ValueError(
7849                        f"Operations config: Calculation '{operation_name}' NOT available"
7850                    )
7851
7852        # Explode INFOS fields into table fields
7853        if self.get_explode_infos():
7854            self.explode_infos(
7855                prefix=self.get_explode_infos_prefix(),
7856                fields=self.get_explode_infos_fields(),
7857                force=True,
7858            )

It takes a list of operations, and for each operation, it checks if it's a python or sql operation, and then calls the appropriate function

param json example: "calculation": { "NOMEN": { "options": { "hgvs_field": "hgvs" }, "middle": null } }

def calculation_process_sql(self, operation: dict, operation_name: str = 'unknown') -> None:
7860    def calculation_process_sql(
7861        self, operation: dict, operation_name: str = "unknown"
7862    ) -> None:
7863        """
7864        The `calculation_process_sql` function takes in a mathematical operation as a string and
7865        performs the operation, updating the specified table with the result.
7866
7867        :param operation: The `operation` parameter is a dictionary that contains information about the
7868        mathematical operation to be performed. It includes the following keys:
7869        :type operation: dict
7870        :param operation_name: The `operation_name` parameter is a string that represents the name of
7871        the mathematical operation being performed. It is used for logging and error handling purposes,
7872        defaults to unknown
7873        :type operation_name: str (optional)
7874        """
7875
7876        # table variants
7877        table_variants = self.get_table_variants(clause="alter")
7878
7879        # Operation infos
7880        operation_name = operation.get("name", "unknown")
7881        log.debug(f"process sql {operation_name}")
7882        output_column_name = operation.get("output_column_name", operation_name)
7883        output_column_type = operation.get("output_column_type", "String")
7884        prefix = operation.get("explode_infos_prefix", "")
7885        output_column_type_sql = code_type_map_to_sql.get(output_column_type, "VARCHAR")
7886        output_column_description = operation.get(
7887            "output_column_description", f"{operation_name} operation"
7888        )
7889        operation_query = operation.get("operation_query", None)
7890        if isinstance(operation_query, list):
7891            operation_query = " ".join(operation_query)
7892        operation_info_fields = operation.get("info_fields", [])
7893        operation_info_fields_check = operation.get("info_fields_check", False)
7894        operation_info = operation.get("operation_info", True)
7895
7896        if operation_query:
7897
7898            # Info fields check
7899            operation_info_fields_check_result = True
7900            if operation_info_fields_check:
7901                header_infos = self.get_header().infos
7902                for info_field in operation_info_fields:
7903                    operation_info_fields_check_result = (
7904                        operation_info_fields_check_result
7905                        and info_field in header_infos
7906                    )
7907
7908            # If info fields available
7909            if operation_info_fields_check_result:
7910
7911                # Added_columns
7912                added_columns = []
7913
7914                # Create VCF header field
7915                vcf_reader = self.get_header()
7916                vcf_reader.infos[output_column_name] = vcf.parser._Info(
7917                    output_column_name,
7918                    ".",
7919                    output_column_type,
7920                    output_column_description,
7921                    "howard calculation",
7922                    "0",
7923                    self.code_type_map.get(output_column_type),
7924                )
7925
7926                # Explode infos if needed
7927                log.debug(f"calculation_process_sql prefix {prefix}")
7928                added_columns += self.explode_infos(
7929                    prefix=prefix,
7930                    fields=[output_column_name] + operation_info_fields,
7931                    force=True,
7932                )
7933
7934                # Create column
7935                added_column = self.add_column(
7936                    table_name=table_variants,
7937                    column_name=prefix + output_column_name,
7938                    column_type=output_column_type_sql,
7939                    default_value="null",
7940                )
7941                added_columns.append(added_column)
7942
7943                # Operation calculation
7944                try:
7945
7946                    # Query to update calculation column
7947                    sql_update = f"""
7948                        UPDATE {table_variants}
7949                        SET "{prefix}{output_column_name}" = ({operation_query})
7950                    """
7951                    self.conn.execute(sql_update)
7952
7953                    # Add to INFO
7954                    if operation_info:
7955                        sql_update_info = f"""
7956                            UPDATE {table_variants}
7957                            SET "INFO" =
7958                                concat(
7959                                    CASE
7960                                        WHEN "INFO" IS NOT NULL
7961                                        THEN concat("INFO", ';')
7962                                        ELSE ''
7963                                    END,
7964                                    '{output_column_name}=',
7965                                    "{prefix}{output_column_name}"
7966                                )
7967                            WHERE "{prefix}{output_column_name}" IS NOT NULL AND "{prefix}{output_column_name}" NOT IN ('')
7968                        """
7969                        self.conn.execute(sql_update_info)
7970
7971                except:
7972                    log.error(
7973                        f"Operations config: Calculation '{operation_name}' query failed"
7974                    )
7975                    raise ValueError(
7976                        f"Operations config: Calculation '{operation_name}' query failed"
7977                    )
7978
7979                # Remove added columns
7980                for added_column in added_columns:
7981                    log.debug(f"added_column: {added_column}")
7982                    self.drop_column(column=added_column)
7983
7984            else:
7985                log.error(
7986                    f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}"
7987                )
7988                raise ValueError(
7989                    f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}"
7990                )
7991
7992        else:
7993            log.error(
7994                f"Operations config: Calculation '{operation_name}' query NOT defined"
7995            )
7996            raise ValueError(
7997                f"Operations config: Calculation '{operation_name}' query NOT defined"
7998            )

The calculation_process_sql function takes in a mathematical operation as a string and performs the operation, updating the specified table with the result.

Parameters
  • operation: The operation parameter is a dictionary that contains information about the mathematical operation to be performed. It includes the following keys:
  • operation_name: The operation_name parameter is a string that represents the name of the mathematical operation being performed. It is used for logging and error handling purposes, defaults to unknown
def calculation_process_function(self, operation: dict, operation_name: str = 'unknown') -> None:
8000    def calculation_process_function(
8001        self, operation: dict, operation_name: str = "unknown"
8002    ) -> None:
8003        """
8004        The `calculation_process_function` takes in an operation dictionary and performs the specified
8005        function with the given parameters.
8006
8007        :param operation: The `operation` parameter is a dictionary that contains information about the
8008        operation to be performed. It has the following keys:
8009        :type operation: dict
8010        :param operation_name: The `operation_name` parameter is a string that represents the name of
8011        the operation being performed. It is used for logging purposes, defaults to unknown
8012        :type operation_name: str (optional)
8013        """
8014
8015        operation_name = operation["name"]
8016        log.debug(f"process sql {operation_name}")
8017        function_name = operation["function_name"]
8018        function_params = operation["function_params"]
8019        getattr(self, function_name)(*function_params)

The calculation_process_function takes in an operation dictionary and performs the specified function with the given parameters.

Parameters
  • operation: The operation parameter is a dictionary that contains information about the operation to be performed. It has the following keys:
  • operation_name: The operation_name parameter is a string that represents the name of the operation being performed. It is used for logging purposes, defaults to unknown
def calculation_variant_id(self) -> None:
8021    def calculation_variant_id(self) -> None:
8022        """
8023        The function `calculation_variant_id` adds a variant ID annotation to a VCF file header and
8024        updates the INFO field of a variants table with the variant ID.
8025        """
8026
8027        # variant_id annotation field
8028        variant_id_tag = self.get_variant_id_column()
8029        added_columns = [variant_id_tag]
8030
8031        # variant_id hgvs tags"
8032        vcf_infos_tags = {
8033            variant_id_tag: "howard variant ID annotation",
8034        }
8035
8036        # Variants table
8037        table_variants = self.get_table_variants()
8038
8039        # Header
8040        vcf_reader = self.get_header()
8041
8042        # Add variant_id to header
8043        vcf_reader.infos[variant_id_tag] = vcf.parser._Info(
8044            variant_id_tag,
8045            ".",
8046            "String",
8047            vcf_infos_tags.get(variant_id_tag, "howard variant ID annotation"),
8048            "howard calculation",
8049            "0",
8050            self.code_type_map.get("String"),
8051        )
8052
8053        # Update
8054        sql_update = f"""
8055            UPDATE {table_variants}
8056            SET "INFO" = 
8057                concat(
8058                    CASE
8059                        WHEN "INFO" IS NULL OR "INFO" IN ('','.')
8060                        THEN ''
8061                        ELSE concat("INFO", ';')
8062                    END,
8063                    '{variant_id_tag}=',
8064                    "{variant_id_tag}"
8065                )
8066        """
8067        self.conn.execute(sql_update)
8068
8069        # Remove added columns
8070        for added_column in added_columns:
8071            self.drop_column(column=added_column)

The function calculation_variant_id adds a variant ID annotation to a VCF file header and updates the INFO field of a variants table with the variant ID.

def calculation_extract_snpeff_hgvs( self, snpeff_hgvs: str = 'snpeff_hgvs', snpeff_field: str = 'ANN') -> None:
    def calculation_extract_snpeff_hgvs(
        self,
        snpeff_hgvs: str = "snpeff_hgvs",
        snpeff_field: str = "ANN",
    ) -> None:
        """
        Extract HGVS nomenclatures from the SnpEff annotation field of the VCF
        and append them to the INFO column of the variants table.

        :param snpeff_hgvs: name of the INFO field that will store the HGVS
            nomenclatures extracted from the SnpEff annotations, defaults to
            snpeff_hgvs
        :type snpeff_hgvs: str (optional)
        :param snpeff_field: name of the INFO field containing the SnpEff
            annotations to parse, defaults to ANN
        :type snpeff_field: str (optional)
        :raises ValueError: if the SnpEff header description does not contain a
            quoted 'A | B | ...' sub-field list
        """

        # Header description for the new INFO field
        vcf_infos_tags = {
            snpeff_hgvs: "HGVS nomenclatures from snpEff annotation",
        }

        # Prefix for exploded INFO columns
        # NOTE(review): any non-empty explode prefix is replaced by "INFO/" -- confirm intended
        prefix = self.get_explode_infos_prefix()
        if prefix:
            prefix = "INFO/"

        # Exploded snpEff source column and target HGVS column names
        speff_ann_infos = prefix + snpeff_field
        speff_hgvs_infos = prefix + snpeff_hgvs

        # Variants table
        table_variants = self.get_table_variants()

        # VCF header
        vcf_reader = self.get_header()

        # Columns added along the way, dropped at the end
        added_columns = []

        # Explode the snpEff annotation INFO field into a table column
        added_columns += self.explode_infos(fields=[snpeff_field])

        if snpeff_field in vcf_reader.infos:

            log.debug(vcf_reader.infos[snpeff_field])

            # Extract the ANN sub-field names from the quoted 'A | B | ...' list
            # carried in the header description
            ann_description = vcf_reader.infos[snpeff_field].desc
            pattern = r"'(.+?)'"
            match = re.search(pattern, ann_description)
            if match:
                ann_header_match = match.group(1).split(" | ")
                ann_header_desc = {}
                for i in range(len(ann_header_match)):
                    # Normalized key: alphanumeric characters only
                    ann_header_info = "".join(
                        char for char in ann_header_match[i] if char.isalnum()
                    )
                    ann_header_desc[ann_header_info] = ann_header_match[i]
                if not ann_header_desc:
                    raise ValueError("Invalid header description format")
            else:
                raise ValueError("Invalid header description format")

            # Variant ID column used to join the dataframe back to the table
            variant_id_column = self.get_variant_id_column()
            added_columns += [variant_id_column]

            # Load variant IDs and snpEff annotations into a pandas dataframe
            dataframe_snpeff_hgvs = self.get_query_to_df(
                f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
            )

            # Extract the HGVS nomenclature for each variant's annotation string
            dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
                speff_ann_infos
            ].apply(
                lambda x: extract_snpeff_hgvs(
                    str(x), header=list(ann_header_desc.values())
                )
            )

            # Declare the new INFO field in the VCF header
            vcf_reader.infos[snpeff_hgvs] = vcf.parser._Info(
                snpeff_hgvs,
                ".",
                "String",
                vcf_infos_tags.get(snpeff_hgvs, "snpEff hgvs annotations"),
                "howard calculation",
                "0",
                self.code_type_map.get("String"),
            )

            # Append '<snpeff_hgvs>=<value>' to INFO, joining on the variant ID.
            # The FROM clause references the local DataFrame by its Python variable
            # name (presumably resolved by duckdb's in-scope dataframe lookup --
            # do not rename the variable).
            # NOTE(review): table name hard-coded as 'variants' in UPDATE while the
            # WHERE clause uses {table_variants} -- confirm they always match
            sql_update = f"""
                UPDATE variants
                SET "INFO" = 
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE 
                            WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
                            AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
                            THEN concat(
                                    '{snpeff_hgvs}=',
                                    dataframe_snpeff_hgvs."{speff_hgvs_infos}"
                                )
                            ELSE ''
                        END
                    )
                FROM dataframe_snpeff_hgvs
                WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"

            """
            self.conn.execute(sql_update)

            # Release the dataframe memory eagerly
            del dataframe_snpeff_hgvs
            gc.collect()

        else:

            log.warning(
                "No snpEff annotation. Please Anotate with snpEff before use this calculation option"
            )

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)

The function calculation_extract_snpeff_hgvs extracts HGVS nomenclatures from the SnpEff annotation field in a VCF file and adds them as a new column in the variants table.

Parameters
  • snpeff_hgvs: The snpeff_hgvs parameter in the calculation_extract_snpeff_hgvs function specifies the name of the column that will store the HGVS nomenclatures extracted from the SnpEff annotation field of a VCF file. It defaults to snpeff_hgvs.
  • snpeff_field: The snpeff_field parameter in the calculation_extract_snpeff_hgvs function represents the field in the VCF file that contains SnpEff annotations. This field is used to extract HGVS nomenclatures from the SnpEff annotation field and add them as a, defaults to ANN
def calculation_snpeff_ann_explode( self, uniquify: bool = True, output_format: str = 'fields', output_prefix: str = 'snpeff_', snpeff_field: str = 'ANN') -> None:
    def calculation_snpeff_ann_explode(
        self,
        uniquify: bool = True,
        output_format: str = "fields",
        output_prefix: str = "snpeff_",
        snpeff_field: str = "ANN",
    ) -> None:
        """
        Explode the SnpEff annotation field into separate INFO annotations and
        append them to the INFO column of the variants table.

        :param uniquify: whether duplicate annotation values should be removed
            from the exploded output, defaults to True
        :type uniquify: bool (optional)
        :param output_format: format of the generated annotations; "fields"
            produces one INFO field per ANN sub-field, "JSON" produces a single
            JSON-valued field, defaults to fields
        :type output_format: str (optional)
        :param output_prefix: prefix added to the generated annotation names to
            distinguish them from existing ones, defaults to snpeff_
        :type output_prefix: str (optional)
        :param snpeff_field: name of the INFO field containing the SnpEff
            annotations to explode, defaults to ANN
        :type snpeff_field: str (optional)
        :raises ValueError: if the SnpEff header description does not contain a
            quoted 'A | B | ...' sub-field list
        """

        # Internal name for the exploded-annotations working column
        snpeff_hgvs = "snpeff_ann_explode"

        # Header description for the generated fields
        vcf_infos_tags = {
            snpeff_hgvs: "Explode snpEff annotations",
        }

        # Prefix for exploded INFO columns
        # NOTE(review): any non-empty explode prefix is replaced by "INFO/" -- confirm intended
        prefix = self.get_explode_infos_prefix()
        if prefix:
            prefix = "INFO/"

        # Exploded snpEff source column and working column names
        speff_ann_infos = prefix + snpeff_field
        speff_hgvs_infos = prefix + snpeff_hgvs

        # Variants table
        table_variants = self.get_table_variants()

        # VCF header
        vcf_reader = self.get_header()

        # Columns added along the way, dropped at the end
        added_columns = []

        # Explode the snpEff annotation INFO field into a table column
        added_columns += self.explode_infos(fields=[snpeff_field])
        log.debug(f"snpeff_field={snpeff_field}")
        log.debug(f"added_columns={added_columns}")

        if snpeff_field in vcf_reader.infos:

            # Extract the ANN sub-field names from the quoted 'A | B | ...' list
            # carried in the header description
            ann_description = vcf_reader.infos[snpeff_field].desc
            pattern = r"'(.+?)'"
            match = re.search(pattern, ann_description)
            if match:
                ann_header_match = match.group(1).split(" | ")
                ann_header = []
                ann_header_desc = {}
                for i in range(len(ann_header_match)):
                    # Normalized key: alphanumeric characters only
                    ann_header_info = "".join(
                        char for char in ann_header_match[i] if char.isalnum()
                    )
                    ann_header.append(ann_header_info)
                    ann_header_desc[ann_header_info] = ann_header_match[i]
                if not ann_header_desc:
                    raise ValueError("Invalid header description format")
            else:
                raise ValueError("Invalid header description format")

            # Variant ID column used to join the dataframe back to the table
            variant_id_column = self.get_variant_id_column()
            added_columns += [variant_id_column]

            # Load variant IDs and snpEff annotations into a pandas dataframe
            dataframe_snpeff_hgvs = self.get_query_to_df(
                f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
            )

            # Explode each variant's annotation string into the requested format
            dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
                speff_ann_infos
            ].apply(
                lambda x: explode_snpeff_ann(
                    str(x),
                    uniquify=uniquify,
                    output_format=output_format,
                    prefix=output_prefix,
                    header=list(ann_header_desc.values()),
                )
            )

            # Declare the generated fields in the VCF header:
            # JSON mode declares a single field named after the prefix,
            # fields mode declares one field per ANN sub-field
            ann_annotations_prefix = ""
            if output_format.upper() in ["JSON"]:
                ann_annotations_prefix = f"{output_prefix}="
                vcf_reader.infos[output_prefix] = vcf.parser._Info(
                    output_prefix,
                    ".",
                    "String",
                    vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
                    + " - JSON format",
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )
            else:
                for ann_annotation in ann_header:
                    ann_annotation_id = f"{output_prefix}{ann_annotation}"
                    vcf_reader.infos[ann_annotation_id] = vcf.parser._Info(
                        ann_annotation_id,
                        ".",
                        "String",
                        vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
                        + f" - '{ann_header_desc[ann_annotation]}' annotation",
                        "howard calculation",
                        "0",
                        self.code_type_map.get("String"),
                    )

            # Append the exploded annotations to INFO, joining on the variant ID.
            # The FROM clause references the local DataFrame by its Python variable
            # name (presumably resolved by duckdb's in-scope dataframe lookup --
            # do not rename the variable).
            # NOTE(review): table name hard-coded as 'variants' in UPDATE while the
            # WHERE clause uses {table_variants} -- confirm they always match
            sql_update = f"""
                UPDATE variants
                SET "INFO" = 
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE 
                            WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
                                AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
                            THEN concat(
                                '{ann_annotations_prefix}',
                                dataframe_snpeff_hgvs."{speff_hgvs_infos}"
                                )
                            ELSE ''
                        END
                    )
                FROM dataframe_snpeff_hgvs
                WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"

            """
            self.conn.execute(sql_update)

            # Release the dataframe memory eagerly
            del dataframe_snpeff_hgvs
            gc.collect()

        else:

            log.warning(
                "No snpEff annotation. Please Anotate with snpEff before use this calculation option"
            )

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)

The calculation_snpeff_ann_explode function processes SnpEff annotations in a VCF file by exploding the HGVS field and updating variant information accordingly.

Parameters
  • uniquify: The uniquify parameter in the calculation_snpeff_ann_explode method is a boolean flag that determines whether the output should be uniquified or not. When set to True, it indicates that the output should be unique, meaning that duplicate entries should be removed, defaults to True
  • output_format: The output_format parameter in the calculation_snpeff_ann_explode function specifies the format in which the output annotations will be generated. It has a default value of "fields". You can also set it to "JSON" to output the annotations in JSON format, defaults to fields
  • output_prefix: The output_prefix parameter in the calculation_snpeff_ann_explode method specifies the prefix added to the output annotations generated during the calculation process, distinguishing the newly added annotations from existing ones. It defaults to snpeff_.
  • snpeff_field: The snpeff_field parameter in the calculation_snpeff_ann_explode function is used to specify the field in the VCF file that contains SnpEff annotations. This field will be processed to explode the HGVS annotations and update the variant information accordingly, defaults to ANN
def calculation_extract_nomen(self) -> None:
8385    def calculation_extract_nomen(self) -> None:
8386        """
8387        This function extracts the HGVS nomenclature from the calculation/identification of NOMEN.
8388        """
8389
8390        # NOMEN field
8391        field_nomen_dict = "NOMEN_DICT"
8392
8393        # NOMEN structure
8394        nomen_dict = {
8395            "NOMEN": "NOMEN hgvs nomenclature considered as reference hgvs (official transcript, first otherwise)",
8396            "CNOMEN": "CNOMEN hgvs nomenclature at DNA level related to a transcript (TNOMEN)",
8397            "RNOMEN": "RNOMEN hgvs nomenclature at RNA level related to a transcript (TNOMEN)",
8398            "NNOMEN": "NNOMEN hgvs nomenclature for non-coding variant",
8399            "PNOMEN": "PNOMEN hgvs nomenclature at Protein level related to a transcript (TNOMEN)",
8400            "TVNOMEN": "TVNOMEN hgvs transcript with version (if any) used (e.g. for CNOMEN and PNOMEN)",
8401            "TNOMEN": "TNOMEN hgvs transcript used (e.g. for CNOMEN and PNOMEN)",
8402            "VNOMEN": "VNOMEN hgvs transcript version used (e.g. for CNOMEN and PNOMEN)",
8403            "ENOMEN": "ENOMEN hgvs exon nomenclature related to a transcript (TNOMEN)",
8404            "GNOMEN": "GNOMEN hgvs gene nomenclature related to a transcript (TNOMEN)",
8405        }
8406
8407        # Param
8408        param = self.get_param()
8409
8410        # Prefix
8411        prefix = self.get_explode_infos_prefix()
8412
8413        # Header
8414        vcf_reader = self.get_header()
8415
8416        # Get HGVS field
8417        hgvs_field = (
8418            param.get("calculation", {})
8419            .get("calculations", {})
8420            .get("NOMEN", {})
8421            .get("options", {})
8422            .get("hgvs_field", "hgvs")
8423        )
8424
8425        # Get transcripts
8426        transcripts_file = (
8427            param.get("calculation", {})
8428            .get("calculations", {})
8429            .get("NOMEN", {})
8430            .get("options", {})
8431            .get("transcripts", None)
8432        )
8433        transcripts_file = full_path(transcripts_file)
8434        transcripts = []
8435        if transcripts_file:
8436            if os.path.exists(transcripts_file):
8437                transcripts_dataframe = transcripts_file_to_df(transcripts_file)
8438                transcripts = transcripts_dataframe.iloc[:, 0].tolist()
8439            else:
8440                log.error(f"Transcript file '{transcripts_file}' does NOT exist")
8441                raise ValueError(f"Transcript file '{transcripts_file}' does NOT exist")
8442
8443        # Added columns
8444        added_columns = []
8445
8446        # Explode HGVS field in column
8447        added_columns += self.explode_infos(fields=[hgvs_field])
8448
8449        # extra infos
8450        extra_infos = self.get_extra_infos()
8451        extra_field = prefix + hgvs_field
8452
8453        if extra_field in extra_infos:
8454
8455            # Create dataframe
8456            dataframe_hgvs = self.get_query_to_df(
8457                f""" SELECT "#CHROM", "POS", "REF", "ALT", "{extra_field}" FROM variants """
8458            )
8459
8460            # Create main NOMEN column
8461            dataframe_hgvs[field_nomen_dict] = dataframe_hgvs[extra_field].apply(
8462                lambda x: find_nomen(str(x), transcripts=transcripts)
8463            )
8464
8465            # Explode NOMEN Structure and create SQL set for update
8466            sql_nomen_fields = []
8467            for nomen_field in nomen_dict:
8468
8469                # Explode each field into a column
8470                dataframe_hgvs[nomen_field] = dataframe_hgvs[field_nomen_dict].apply(
8471                    lambda x: dict(x).get(nomen_field, "")
8472                )
8473
8474                # Create VCF header field
8475                vcf_reader.infos[nomen_field] = vcf.parser._Info(
8476                    nomen_field,
8477                    ".",
8478                    "String",
8479                    nomen_dict.get(nomen_field, "howard calculation NOMEN"),
8480                    "howard calculation",
8481                    "0",
8482                    self.code_type_map.get("String"),
8483                )
8484                sql_nomen_fields.append(
8485                    f"""
8486                        CASE 
8487                            WHEN dataframe_hgvs."{nomen_field}" NOT NULL AND dataframe_hgvs."{nomen_field}" NOT IN ('')
8488                            THEN concat(
8489                                    ';{nomen_field}=',
8490                                    dataframe_hgvs."{nomen_field}"
8491                                )
8492                            ELSE ''
8493                        END
8494                    """
8495                )
8496
8497            # SQL set for update
8498            sql_nomen_fields_set = ", ".join(sql_nomen_fields)
8499
8500            # Update
8501            sql_update = f"""
8502                UPDATE variants
8503                SET "INFO" = 
8504                    concat(
8505                        CASE
8506                            WHEN "INFO" IS NULL
8507                            THEN ''
8508                            ELSE "INFO"
8509                        END,
8510                        {sql_nomen_fields_set}
8511                    )
8512                FROM dataframe_hgvs
8513                WHERE variants."#CHROM" = dataframe_hgvs."#CHROM"
8514                    AND variants."POS" = dataframe_hgvs."POS" 
8515                    AND variants."REF" = dataframe_hgvs."REF"
8516                    AND variants."ALT" = dataframe_hgvs."ALT"
8517            """
8518            self.conn.execute(sql_update)
8519
8520            # Delete dataframe
8521            del dataframe_hgvs
8522            gc.collect()
8523
8524        # Remove added columns
8525        for added_column in added_columns:
8526            self.drop_column(column=added_column)

This function extracts the HGVS nomenclature from the calculation/identification of NOMEN.

def calculation_find_by_pipeline(self, tag: str = 'findbypipeline') -> None:
8528    def calculation_find_by_pipeline(self, tag: str = "findbypipeline") -> None:
8529        """
8530        The function `calculation_find_by_pipeline` performs a calculation to find the number of
8531        pipeline/sample for a variant and updates the variant information in a VCF file.
8532
8533        :param tag: The `tag` parameter is a string that represents the annotation field for the
8534        "findbypipeline" information in the VCF file. It is used to create the annotation field in the
8535        VCF header and to update the corresponding field in the variants table, defaults to
8536        findbypipeline
8537        :type tag: str (optional)
8538        """
8539
8540        # if FORMAT and samples
8541        if (
8542            "FORMAT" in self.get_header_columns_as_list()
8543            and self.get_header_sample_list()
8544        ):
8545
8546            # findbypipeline annotation field
8547            findbypipeline_tag = tag
8548
8549            # VCF infos tags
8550            vcf_infos_tags = {
8551                findbypipeline_tag: f"Number of pipeline/sample for a variant ({findbypipeline_tag})",
8552            }
8553
8554            # Prefix
8555            prefix = self.get_explode_infos_prefix()
8556
8557            # Field
8558            findbypipeline_infos = prefix + findbypipeline_tag
8559
8560            # Variants table
8561            table_variants = self.get_table_variants()
8562
8563            # Header
8564            vcf_reader = self.get_header()
8565
8566            # Create variant id
8567            variant_id_column = self.get_variant_id_column()
8568            added_columns = [variant_id_column]
8569
8570            # variant_id, FORMAT and samples
8571            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
8572                self.get_header_sample_list()
8573            )
8574
8575            # Create dataframe
8576            dataframe_findbypipeline = self.get_query_to_df(
8577                f""" SELECT {samples_fields} FROM {table_variants} """
8578            )
8579
8580            # Create findbypipeline column
8581            dataframe_findbypipeline[findbypipeline_infos] = (
8582                dataframe_findbypipeline.apply(
8583                    lambda row: findbypipeline(
8584                        row, samples=self.get_header_sample_list()
8585                    ),
8586                    axis=1,
8587                )
8588            )
8589
8590            # Add snpeff_hgvs to header
8591            vcf_reader.infos[findbypipeline_tag] = vcf.parser._Info(
8592                findbypipeline_tag,
8593                ".",
8594                "String",
8595                vcf_infos_tags.get(findbypipeline_tag, "Find in pipeline/sample"),
8596                "howard calculation",
8597                "0",
8598                self.code_type_map.get("String"),
8599            )
8600
8601            # Update
8602            sql_update = f"""
8603                UPDATE variants
8604                SET "INFO" = 
8605                    concat(
8606                        CASE
8607                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
8608                            THEN ''
8609                            ELSE concat("INFO", ';')
8610                        END,
8611                        CASE 
8612                            WHEN dataframe_findbypipeline."{findbypipeline_infos}" NOT IN ('','.')
8613                                AND dataframe_findbypipeline."{findbypipeline_infos}" NOT NULL
8614                            THEN concat(
8615                                    '{findbypipeline_tag}=',
8616                                    dataframe_findbypipeline."{findbypipeline_infos}"
8617                                )
8618                            ELSE ''
8619                        END
8620                    )
8621                FROM dataframe_findbypipeline
8622                WHERE variants."{variant_id_column}" = dataframe_findbypipeline."{variant_id_column}"
8623            """
8624            self.conn.execute(sql_update)
8625
8626            # Remove added columns
8627            for added_column in added_columns:
8628                self.drop_column(column=added_column)
8629
8630            # Delete dataframe
8631            del dataframe_findbypipeline
8632            gc.collect()

The function calculation_find_by_pipeline performs a calculation to find the number of pipeline/sample for a variant and updates the variant information in a VCF file.

Parameters
  • tag: The tag parameter is a string that represents the annotation field for the "findbypipeline" information in the VCF file. It is used to create the annotation field in the VCF header and to update the corresponding field in the variants table, defaults to findbypipeline
def calculation_genotype_concordance(self) -> None:
8634    def calculation_genotype_concordance(self) -> None:
8635        """
8636        The function `calculation_genotype_concordance` calculates the genotype concordance for
8637        multi-caller VCF files and updates the variant information in the database.
8638        """
8639
8640        # if FORMAT and samples
8641        if (
8642            "FORMAT" in self.get_header_columns_as_list()
8643            and self.get_header_sample_list()
8644        ):
8645
8646            # genotypeconcordance annotation field
8647            genotypeconcordance_tag = "genotypeconcordance"
8648
8649            # VCF infos tags
8650            vcf_infos_tags = {
8651                genotypeconcordance_tag: "Concordance of genotype for multi caller VCF",
8652            }
8653
8654            # Prefix
8655            prefix = self.get_explode_infos_prefix()
8656
8657            # Field
8658            genotypeconcordance_infos = prefix + genotypeconcordance_tag
8659
8660            # Variants table
8661            table_variants = self.get_table_variants()
8662
8663            # Header
8664            vcf_reader = self.get_header()
8665
8666            # Create variant id
8667            variant_id_column = self.get_variant_id_column()
8668            added_columns = [variant_id_column]
8669
8670            # variant_id, FORMAT and samples
8671            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
8672                self.get_header_sample_list()
8673            )
8674
8675            # Create dataframe
8676            dataframe_genotypeconcordance = self.get_query_to_df(
8677                f""" SELECT {samples_fields} FROM {table_variants} """
8678            )
8679
8680            # Create genotypeconcordance column
8681            dataframe_genotypeconcordance[genotypeconcordance_infos] = (
8682                dataframe_genotypeconcordance.apply(
8683                    lambda row: genotypeconcordance(
8684                        row, samples=self.get_header_sample_list()
8685                    ),
8686                    axis=1,
8687                )
8688            )
8689
8690            # Add genotypeconcordance to header
8691            vcf_reader.infos[genotypeconcordance_tag] = vcf.parser._Info(
8692                genotypeconcordance_tag,
8693                ".",
8694                "String",
8695                vcf_infos_tags.get(genotypeconcordance_tag, "snpEff hgvs annotations"),
8696                "howard calculation",
8697                "0",
8698                self.code_type_map.get("String"),
8699            )
8700
8701            # Update
8702            sql_update = f"""
8703                UPDATE variants
8704                SET "INFO" = 
8705                    concat(
8706                        CASE
8707                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
8708                            THEN ''
8709                            ELSE concat("INFO", ';')
8710                        END,
8711                        CASE
8712                            WHEN dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT IN ('','.')
8713                                AND dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT NULL
8714                            THEN concat(
8715                                    '{genotypeconcordance_tag}=',
8716                                    dataframe_genotypeconcordance."{genotypeconcordance_infos}"
8717                                )
8718                            ELSE ''
8719                        END
8720                    )
8721                FROM dataframe_genotypeconcordance
8722                WHERE variants."{variant_id_column}" = dataframe_genotypeconcordance."{variant_id_column}"
8723            """
8724            self.conn.execute(sql_update)
8725
8726            # Remove added columns
8727            for added_column in added_columns:
8728                self.drop_column(column=added_column)
8729
8730            # Delete dataframe
8731            del dataframe_genotypeconcordance
8732            gc.collect()

The function calculation_genotype_concordance calculates the genotype concordance for multi-caller VCF files and updates the variant information in the database.

def calculation_barcode(self, tag: str = 'barcode') -> None:
8734    def calculation_barcode(self, tag: str = "barcode") -> None:
8735        """
8736        The `calculation_barcode` function calculates barcode values for variants in a VCF file and
8737        updates the INFO field in the file with the calculated barcode values.
8738
8739        :param tag: The `tag` parameter in the `calculation_barcode` function is used to specify the tag
8740        name that will be used for the barcode calculation in the VCF file. If no tag name is provided,
8741        the default tag name is set to "barcode", defaults to barcode
8742        :type tag: str (optional)
8743        """
8744
8745        # if FORMAT and samples
8746        if (
8747            "FORMAT" in self.get_header_columns_as_list()
8748            and self.get_header_sample_list()
8749        ):
8750
8751            # barcode annotation field
8752            if not tag:
8753                tag = "barcode"
8754
8755            # VCF infos tags
8756            vcf_infos_tags = {
8757                tag: "barcode calculation (VaRank)",
8758            }
8759
8760            # Prefix
8761            prefix = self.get_explode_infos_prefix()
8762
8763            # Field
8764            barcode_infos = prefix + tag
8765
8766            # Variants table
8767            table_variants = self.get_table_variants()
8768
8769            # Header
8770            vcf_reader = self.get_header()
8771
8772            # Create variant id
8773            variant_id_column = self.get_variant_id_column()
8774            added_columns = [variant_id_column]
8775
8776            # variant_id, FORMAT and samples
8777            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
8778                self.get_header_sample_list()
8779            )
8780
8781            # Create dataframe
8782            dataframe_barcode = self.get_query_to_df(
8783                f""" SELECT {samples_fields} FROM {table_variants} """
8784            )
8785
8786            # Create barcode column
8787            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
8788                lambda row: barcode(row, samples=self.get_header_sample_list()), axis=1
8789            )
8790
8791            # Add barcode to header
8792            vcf_reader.infos[tag] = vcf.parser._Info(
8793                tag,
8794                ".",
8795                "String",
8796                vcf_infos_tags.get(tag, vcf_infos_tags.get(tag)),
8797                "howard calculation",
8798                "0",
8799                self.code_type_map.get("String"),
8800            )
8801
8802            # Update
8803            sql_update = f"""
8804                UPDATE {table_variants}
8805                SET "INFO" = 
8806                    concat(
8807                        CASE
8808                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
8809                            THEN ''
8810                            ELSE concat("INFO", ';')
8811                        END,
8812                        CASE
8813                            WHEN dataframe_barcode."{barcode_infos}" NOT IN ('','.')
8814                            AND dataframe_barcode."{barcode_infos}" NOT NULL
8815                            THEN concat(
8816                                    '{tag}=',
8817                                    dataframe_barcode."{barcode_infos}"
8818                                )
8819                            ELSE ''
8820                        END
8821                    )
8822                FROM dataframe_barcode
8823                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
8824            """
8825            self.conn.execute(sql_update)
8826
8827            # Remove added columns
8828            for added_column in added_columns:
8829                self.drop_column(column=added_column)
8830
8831            # Delete dataframe
8832            del dataframe_barcode
8833            gc.collect()

The calculation_barcode function calculates barcode values for variants in a VCF file and updates the INFO field in the file with the calculated barcode values.

Parameters
  • tag: The tag parameter in the calculation_barcode function is used to specify the tag name that will be used for the barcode calculation in the VCF file. If no tag name is provided, the default tag name is set to "barcode", defaults to barcode
def calculation_barcode_family(self, tag: str = 'BCF') -> None:
    def calculation_barcode_family(self, tag: str = "BCF") -> None:
        """
        The `calculation_barcode_family` function computes a family barcode for each
        variant from the genotypes of the pedigree samples and appends it to the
        FORMAT column and to every sample column (NOT to the INFO field): each
        pedigree-sample genotype gets ':<barcode>:<sample list>' appended, each
        non-pedigree sample gets ':.:.', and FORMAT gets ':<tag>:<tag>S'.

        The pedigree is read from param
        'calculation.calculations.BARCODEFAMILY.family_pedigree' and may be a JSON
        file path, a JSON string, a comma-separated list of sample names, or a
        dict; if unset, all samples of the file are used.

        :param tag: The `tag` parameter in the `calculation_barcode_family` function is used to specify
        the barcode tag that will be added to the VCF file during the calculation process. If no value
        is provided for the `tag` parameter, the default value used is "BCF", defaults to BCF
        :type tag: str (optional)
        :raises ValueError: if the pedigree is not well formatted or resolves to no samples
        """

        # No-op unless the file has a FORMAT column and at least one sample
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # barcode annotation field (fall back to default if tag is falsy)
            if not tag:
                tag = "BCF"

            # VCF tags: <tag> = barcode value, <tag>S = list of pedigree samples
            vcf_infos_tags = {
                tag: "barcode family calculation",
                f"{tag}S": "barcode family samples",
            }

            # Param
            param = self.get_param()
            log.debug(f"param={param}")

            # Prefix used for exploded INFO columns
            prefix = self.get_explode_infos_prefix()

            # PED param
            ped = (
                param.get("calculation", {})
                .get("calculations", {})
                .get("BARCODEFAMILY", {})
                .get("family_pedigree", None)
            )
            log.debug(f"ped={ped}")

            # Load PED: normalize every accepted input form into a dict
            # {member: sample_name}
            if ped:

                # Pedigree is a file (NOTE: 'ped' is deliberately rebound from
                # path -> file handle -> parsed dict)
                if isinstance(ped, str) and os.path.exists(full_path(ped)):
                    log.debug("Pedigree is file")
                    with open(full_path(ped)) as ped:
                        ped = json.load(ped)

                # Pedigree is a string: JSON first, otherwise a comma-separated
                # sample list (each sample mapped to itself)
                elif isinstance(ped, str):
                    log.debug("Pedigree is str")
                    try:
                        ped = json.loads(ped)
                        log.debug("Pedigree is json str")
                    except ValueError as e:
                        # 'e' unused: a non-JSON string is interpreted as a
                        # comma-separated sample list, not an error
                        ped_samples = ped.split(",")
                        ped = {}
                        for ped_sample in ped_samples:
                            ped[ped_sample] = ped_sample

                # Pedigree is already a dict: use as-is
                elif isinstance(ped, dict):
                    log.debug("Pedigree is dict")

                # Pedigree is not well formatted
                else:
                    msg_error = "Pedigree not well formatted"
                    log.error(msg_error)
                    raise ValueError(msg_error)

                # Construct the flat list of pedigree sample names
                ped_samples = list(ped.values())

            else:
                # No pedigree configured: use every sample of the file
                log.debug("Pedigree not defined. Take all samples")
                ped_samples = self.get_header_sample_list()
                ped = {}
                for ped_sample in ped_samples:
                    ped[ped_sample] = ped_sample

            # Check pedigree is non-empty
            if not ped or len(ped) == 0:
                msg_error = f"Error in pedigree: samples {ped_samples}"
                log.error(msg_error)
                raise ValueError(msg_error)

            # Log
            log.info(
                "Calculation 'BARCODEFAMILY' - Samples: "
                + ", ".join([f"{member}='{ped[member]}'" for member in ped])
            )
            log.debug(f"ped_samples={ped_samples}")

            # Name of the dataframe column holding the computed barcode
            barcode_infos = prefix + tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id column (registered for cleanup at the end)
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and pedigree sample columns only
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                ped_samples
            )

            # Create dataframe
            dataframe_barcode = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Compute the barcode per variant from the pedigree samples
            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
                lambda row: barcode(row, samples=ped_samples), axis=1
            )

            # Declare both FORMAT fields (<tag> and <tag>S) in the header
            vcf_reader.formats[tag] = vcf.parser._Format(
                id=tag,
                num=".",
                type="String",
                desc=vcf_infos_tags.get(tag, "barcode family calculation"),
                type_code=self.code_type_map.get("String"),
            )
            vcf_reader.formats[f"{tag}S"] = vcf.parser._Format(
                id=f"{tag}S",
                num=".",
                type="String",
                desc=vcf_infos_tags.get(f"{tag}S", "barcode family samples"),
                type_code=self.code_type_map.get("String"),
            )

            # Build one SET clause per sample column plus FORMAT:
            # - pedigree sample  -> append barcode value and sample list
            # - FORMAT           -> append the two new field names
            # - other sample     -> append '.' placeholders
            sql_update_set = []
            for sample in self.get_header_sample_list() + ["FORMAT"]:
                if sample in ped_samples:
                    value = f'dataframe_barcode."{barcode_infos}"'
                    value_samples = "'" + ",".join(ped_samples) + "'"
                elif sample == "FORMAT":
                    value = f"'{tag}'"
                    value_samples = f"'{tag}S'"
                else:
                    value = "'.'"
                    value_samples = "'.'"
                # A './.' genotype is first padded with one ':.' per FORMAT
                # sub-field (stripping alphanumerics/whitespace from FORMAT
                # leaves only the ':' separators) so the appended values line up
                format_regex = r"[a-zA-Z0-9\s]"
                sql_update_set.append(
                    f"""
                        "{sample}" = 
                        concat(
                            CASE
                                WHEN {table_variants}."{sample}" = './.'
                                THEN concat('./.',regexp_replace(regexp_replace({table_variants}.FORMAT, '{format_regex}', '', 'g'), ':', ':.', 'g'))
                                ELSE {table_variants}."{sample}"
                            END,
                            ':',
                            {value},
                            ':',
                            {value_samples}
                        )
                    """
                )

            # Apply all SET clauses in a single UPDATE; 'dataframe_barcode' is
            # resolved by duckdb through its Python variable name
            sql_update_set_join = ", ".join(sql_update_set)
            sql_update = f"""
                UPDATE {table_variants}
                SET {sql_update_set_join}
                FROM dataframe_barcode
                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe
            del dataframe_barcode
            gc.collect()

The calculation_barcode_family function calculates family barcode values for variants in a VCF file and appends the calculated barcode values to the FORMAT field and to each sample column.

Parameters
  • tag: The tag parameter in the calculation_barcode_family function is used to specify the barcode tag that will be added to the VCF file during the calculation process. If no value is provided for the tag parameter, the default value used is "BCF", defaults to BCF
def calculation_trio(self) -> None:
    def calculation_trio(self) -> None:
        """
        The `calculation_trio` function performs trio calculations on a VCF file by adding trio
        information to the INFO field of each variant.

        The trio pedigree is read from param
        'calculation.calculations.TRIO.trio_pedigree' and may be a JSON file path,
        a JSON string, a comma-separated 'father,mother,child' list, or a dict
        with 'father'/'mother'/'child' keys; if unset, the first three samples of
        the file are used. The per-variant value computed by the `trio` helper is
        appended to INFO as 'trio=<value>'.

        :raises ValueError: if the trio pedigree is not well formatted or fewer
            than three samples are available
        """

        # No-op unless the file has a FORMAT column and at least one sample
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # trio annotation field
            trio_tag = "trio"

            # VCF infos tags
            vcf_infos_tags = {
                "trio": "trio calculation",
            }

            # Param
            param = self.get_param()

            # Prefix used for exploded INFO columns
            prefix = self.get_explode_infos_prefix()

            # Trio param
            trio_ped = (
                param.get("calculation", {})
                .get("calculations", {})
                .get("TRIO", {})
                .get("trio_pedigree", None)
            )

            # Load trio: normalize every accepted input form into a dict with
            # 'father'/'mother'/'child' keys
            if trio_ped:

                # Trio pedigree is a file (NOTE: 'trio_ped' is deliberately
                # rebound from path -> file handle -> parsed dict)
                if isinstance(trio_ped, str) and os.path.exists(full_path(trio_ped)):
                    log.debug("TRIO pedigree is file")
                    with open(full_path(trio_ped)) as trio_ped:
                        trio_ped = json.load(trio_ped)

                # Trio pedigree is a string: JSON first, otherwise a
                # comma-separated 'father,mother,child' list
                elif isinstance(trio_ped, str):
                    log.debug("TRIO pedigree is str")
                    try:
                        trio_ped = json.loads(trio_ped)
                        log.debug("TRIO pedigree is json str")
                    except ValueError as e:
                        # 'e' unused: a non-JSON string is interpreted as a
                        # comma-separated sample list, not an error
                        trio_samples = trio_ped.split(",")
                        if len(trio_samples) == 3:
                            trio_ped = {
                                "father": trio_samples[0],
                                "mother": trio_samples[1],
                                "child": trio_samples[2],
                            }
                            log.debug("TRIO pedigree is list str")
                        else:
                            msg_error = "TRIO pedigree not well formatted"
                            log.error(msg_error)
                            raise ValueError(msg_error)

                # Trio pedigree is already a dict: use as-is
                elif isinstance(trio_ped, dict):
                    log.debug("TRIO pedigree is dict")

                # Trio pedigree is not well formatted
                else:
                    msg_error = "TRIO pedigree not well formatted"
                    log.error(msg_error)
                    raise ValueError(msg_error)

                # Construct trio list in father/mother/child order
                trio_samples = [
                    trio_ped.get("father", ""),
                    trio_ped.get("mother", ""),
                    trio_ped.get("child", ""),
                ]

            else:
                # No pedigree configured: take the first three samples as
                # father, mother, child (in header order)
                log.debug("TRIO pedigree not defined. Take the first 3 samples")
                samples_list = self.get_header_sample_list()
                if len(samples_list) >= 3:
                    trio_samples = self.get_header_sample_list()[0:3]
                    trio_ped = {
                        "father": trio_samples[0],
                        "mother": trio_samples[1],
                        "child": trio_samples[2],
                    }
                else:
                    msg_error = f"Error in TRIO pedigree: only {len(samples_list)} samples {samples_list}"
                    log.error(msg_error)
                    raise ValueError(msg_error)

            # Check trio pedigree has exactly the three expected members
            if not trio_ped or len(trio_ped) != 3:
                msg_error = f"Error in TRIO pedigree: {trio_ped}"
                log.error(msg_error)
                raise ValueError(msg_error)

            # Log
            log.info(
                f"Calculation 'TRIO' - Samples: "
                + ", ".join([f"{member}='{trio_ped[member]}'" for member in trio_ped])
            )

            # Name of the dataframe column holding the computed value
            trio_infos = prefix + trio_tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id column (registered for cleanup at the end)
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                self.get_header_sample_list()
            )

            # Create dataframe
            dataframe_trio = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Compute the trio value per variant from the trio samples
            dataframe_trio[trio_infos] = dataframe_trio.apply(
                lambda row: trio(row, samples=trio_samples), axis=1
            )

            # Add trio to header
            # NOTE(review): the fallback description "snpEff hgvs annotations"
            # looks like a copy-paste leftover; it is never used because
            # trio_tag is always a key of vcf_infos_tags
            vcf_reader.infos[trio_tag] = vcf.parser._Info(
                trio_tag,
                ".",
                "String",
                vcf_infos_tags.get(trio_tag, "snpEff hgvs annotations"),
                "howard calculation",
                "0",
                self.code_type_map.get("String"),
            )

            # Update INFO with the computed value; 'dataframe_trio' is resolved
            # by duckdb through its Python variable name (replacement scan)
            sql_update = f"""
                UPDATE {table_variants}
                SET "INFO" = 
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE
                            WHEN dataframe_trio."{trio_infos}" NOT IN ('','.')
                             AND dataframe_trio."{trio_infos}" NOT NULL
                            THEN concat(
                                    '{trio_tag}=',
                                    dataframe_trio."{trio_infos}"
                                )
                            ELSE ''
                        END
                    )
                FROM dataframe_trio
                WHERE {table_variants}."{variant_id_column}" = dataframe_trio."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe
            del dataframe_trio
            gc.collect()

The calculation_trio function performs trio calculations on a VCF file by adding trio information to the INFO field of each variant.

def calculation_vaf_normalization(self) -> None:
9204    def calculation_vaf_normalization(self) -> None:
9205        """
9206        The `calculation_vaf_normalization` function calculates the VAF (Variant Allele Frequency)
9207        normalization for each sample in a VCF file and updates the FORMAT and INFO fields accordingly.
9208        :return: The function does not return anything.
9209        """
9210
9211        # if FORMAT and samples
9212        if (
9213            "FORMAT" in self.get_header_columns_as_list()
9214            and self.get_header_sample_list()
9215        ):
9216
9217            # vaf_normalization annotation field
9218            vaf_normalization_tag = "VAF"
9219
9220            # VCF infos tags
9221            vcf_infos_tags = {
9222                "VAF": "VAF Variant Frequency",
9223            }
9224
9225            # Prefix
9226            prefix = self.get_explode_infos_prefix()
9227
9228            # Variants table
9229            table_variants = self.get_table_variants()
9230
9231            # Header
9232            vcf_reader = self.get_header()
9233
9234            # Do not calculate if VAF already exists
9235            if "VAF" in vcf_reader.formats:
9236                log.debug("VAF already on genotypes")
9237                return
9238
9239            # Create variant id
9240            variant_id_column = self.get_variant_id_column()
9241            added_columns = [variant_id_column]
9242
9243            # variant_id, FORMAT and samples
9244            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
9245                f""" "{sample}" """ for sample in self.get_header_sample_list()
9246            )
9247
9248            # Create dataframe
9249            query = f""" SELECT {variant_id_column}, FORMAT, {samples_fields} FROM {table_variants} """
9250            log.debug(f"query={query}")
9251            dataframe_vaf_normalization = self.get_query_to_df(query=query)
9252
9253            vaf_normalization_set = []
9254
9255            # for each sample vaf_normalization
9256            for sample in self.get_header_sample_list():
9257                dataframe_vaf_normalization[sample] = dataframe_vaf_normalization.apply(
9258                    lambda row: vaf_normalization(row, sample=sample), axis=1
9259                )
9260                vaf_normalization_set.append(
9261                    f""" "{sample}" = dataframe_vaf_normalization."{sample}" """
9262                )
9263
9264            # Add VAF to FORMAT
9265            dataframe_vaf_normalization["FORMAT"] = dataframe_vaf_normalization[
9266                "FORMAT"
9267            ].apply(lambda x: str(x) + ":VAF")
9268            vaf_normalization_set.append(
9269                f""" "FORMAT" = dataframe_vaf_normalization."FORMAT" """
9270            )
9271
9272            # Add vaf_normalization to header
9273            vcf_reader.formats[vaf_normalization_tag] = vcf.parser._Format(
9274                id=vaf_normalization_tag,
9275                num="1",
9276                type="Float",
9277                desc=vcf_infos_tags.get(vaf_normalization_tag, "VAF Variant Frequency"),
9278                type_code=self.code_type_map.get("Float"),
9279            )
9280
9281            # Create fields to add in INFO
9282            sql_vaf_normalization_set = " , ".join(vaf_normalization_set)
9283
9284            # Update
9285            sql_update = f"""
9286                UPDATE {table_variants}
9287                SET {sql_vaf_normalization_set}
9288                FROM dataframe_vaf_normalization
9289                WHERE variants."{variant_id_column}" = dataframe_vaf_normalization."{variant_id_column}"
9290
9291            """
9292            self.conn.execute(sql_update)
9293
9294            # Remove added columns
9295            for added_column in added_columns:
9296                self.drop_column(column=added_column)
9297
9298            # Delete dataframe
9299            del dataframe_vaf_normalization
9300            gc.collect()

The calculation_vaf_normalization function calculates the VAF (Variant Allele Frequency) normalization for each sample in a VCF file and updates the FORMAT and sample genotype columns accordingly.

Returns

The function does not return anything.

def calculation_genotype_stats(self, info: str = 'VAF') -> None:
9302    def calculation_genotype_stats(self, info: str = "VAF") -> None:
9303        """
9304        The `calculation_genotype_stats` function calculates genotype statistics for a given information
9305        field in a VCF file and updates the INFO column of the variants table with the calculated
9306        statistics.
9307
9308        :param info: The `info` parameter is a string that represents the type of information for which
9309        genotype statistics are calculated. It is used to generate various VCF info tags for the
9310        statistics, such as the number of occurrences, the list of values, the minimum value, the
9311        maximum value, the mean, the median, defaults to VAF
9312        :type info: str (optional)
9313        """
9314
9315        # if FORMAT and samples
9316        if (
9317            "FORMAT" in self.get_header_columns_as_list()
9318            and self.get_header_sample_list()
9319        ):
9320
9321            # vaf_stats annotation field
9322            vaf_stats_tag = info + "_stats"
9323
9324            # VCF infos tags
9325            vcf_infos_tags = {
9326                info + "_stats_nb": f"genotype {info} Statistics - number of {info}",
9327                info + "_stats_list": f"genotype {info} Statistics - list of {info}",
9328                info + "_stats_min": f"genotype {info} Statistics - min {info}",
9329                info + "_stats_max": f"genotype {info} Statistics - max {info}",
9330                info + "_stats_mean": f"genotype {info} Statistics - mean {info}",
9331                info + "_stats_mediane": f"genotype {info} Statistics - mediane {info}",
9332                info
9333                + "_stats_stdev": f"genotype {info} Statistics - standard deviation {info}",
9334            }
9335
9336            # Prefix
9337            prefix = self.get_explode_infos_prefix()
9338
9339            # Field
9340            vaf_stats_infos = prefix + vaf_stats_tag
9341
9342            # Variants table
9343            table_variants = self.get_table_variants()
9344
9345            # Header
9346            vcf_reader = self.get_header()
9347
9348            # Create variant id
9349            variant_id_column = self.get_variant_id_column()
9350            added_columns = [variant_id_column]
9351
9352            # variant_id, FORMAT and samples
9353            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
9354                self.get_header_sample_list()
9355            )
9356
9357            # Create dataframe
9358            dataframe_vaf_stats = self.get_query_to_df(
9359                f""" SELECT {samples_fields} FROM {table_variants} """
9360            )
9361
9362            # Create vaf_stats column
9363            dataframe_vaf_stats[vaf_stats_infos] = dataframe_vaf_stats.apply(
9364                lambda row: genotype_stats(
9365                    row, samples=self.get_header_sample_list(), info=info
9366                ),
9367                axis=1,
9368            )
9369
9370            # List of vcf tags
9371            sql_vaf_stats_fields = []
9372
9373            # Check all VAF stats infos
9374            for stat in vcf_infos_tags:
9375
9376                # Extract stats
9377                dataframe_vaf_stats[stat] = dataframe_vaf_stats[vaf_stats_infos].apply(
9378                    lambda x: dict(x).get(stat, "")
9379                )
9380
9381                # Add snpeff_hgvs to header
9382                vcf_reader.infos[stat] = vcf.parser._Info(
9383                    stat,
9384                    ".",
9385                    "String",
9386                    vcf_infos_tags.get(stat, "genotype statistics"),
9387                    "howard calculation",
9388                    "0",
9389                    self.code_type_map.get("String"),
9390                )
9391
9392                if len(sql_vaf_stats_fields):
9393                    sep = ";"
9394                else:
9395                    sep = ""
9396
9397                # Create fields to add in INFO
9398                sql_vaf_stats_fields.append(
9399                    f"""
9400                        CASE
9401                            WHEN dataframe_vaf_stats."{stat}" NOT NULL
9402                            THEN concat(
9403                                    '{sep}{stat}=',
9404                                    dataframe_vaf_stats."{stat}"
9405                                )
9406                            ELSE ''
9407                        END
9408                    """
9409                )
9410
9411            # SQL set for update
9412            sql_vaf_stats_fields_set = ",  ".join(sql_vaf_stats_fields)
9413
9414            # Update
9415            sql_update = f"""
9416                UPDATE {table_variants}
9417                SET "INFO" = 
9418                    concat(
9419                        CASE
9420                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
9421                            THEN ''
9422                            ELSE concat("INFO", ';')
9423                        END,
9424                        {sql_vaf_stats_fields_set}
9425                    )
9426                FROM dataframe_vaf_stats
9427                WHERE {table_variants}."{variant_id_column}" = dataframe_vaf_stats."{variant_id_column}"
9428
9429            """
9430            self.conn.execute(sql_update)
9431
9432            # Remove added columns
9433            for added_column in added_columns:
9434                self.drop_column(column=added_column)
9435
9436            # Delete dataframe
9437            del dataframe_vaf_stats
9438            gc.collect()

The calculation_genotype_stats function calculates genotype statistics for a given information field in a VCF file and updates the INFO column of the variants table with the calculated statistics.

Parameters
  • info: The info parameter is a string that represents the type of information for which genotype statistics are calculated. It is used to generate various VCF info tags for the statistics, such as the number of occurrences, the list of values, the minimum value, the maximum value, the mean, the median, defaults to VAF
def calculation_transcripts_annotation(self, info_json: str = None, info_format: str = None) -> None:
9440    def calculation_transcripts_annotation(
9441        self, info_json: str = None, info_format: str = None
9442    ) -> None:
9443        """
9444        The `calculation_transcripts_annotation` function creates a transcripts table and adds an info
9445        field to it if transcripts are available.
9446
9447        :param info_json: The `info_json` parameter in the `calculation_transcripts_annotation` method
9448        is a string parameter that represents the information field to be used in the transcripts JSON.
9449        It is used to specify the JSON format for the transcripts information. If no value is provided
9450        when calling the method, it defaults to "
9451        :type info_json: str
9452        :param info_format: The `info_format` parameter in the `calculation_transcripts_annotation`
9453        method is a string parameter that specifies the format of the information field to be used in
9454        the transcripts JSON. It is used to define the format of the information field
9455        :type info_format: str
9456        """
9457
9458        # Create transcripts table
9459        transcripts_table = self.create_transcript_view()
9460
9461        # Add info field
9462        if transcripts_table:
9463            self.transcript_view_to_variants(
9464                transcripts_table=transcripts_table,
9465                transcripts_info_field_json=info_json,
9466                transcripts_info_field_format=info_format,
9467            )
9468        else:
9469            log.info("No Transcripts to process. Check param.json file configuration")

The calculation_transcripts_annotation function creates a transcripts table and adds an info field to it if transcripts are available.

Parameters
  • info_json: The info_json parameter in the calculation_transcripts_annotation method is a string parameter that represents the information field to be used in the transcripts JSON. It is used to specify the JSON format for the transcripts information. If no value is provided when calling the method, it defaults to None.
  • info_format: The info_format parameter in the calculation_transcripts_annotation method is a string parameter that specifies the format of the information field to be used in the transcripts JSON. It is used to define the format of the information field
def calculation_transcripts_prioritization(self) -> None:
9471    def calculation_transcripts_prioritization(self) -> None:
9472        """
9473        The function `calculation_transcripts_prioritization` creates a transcripts table and
9474        prioritizes transcripts based on certain criteria.
9475        """
9476
9477        # Create transcripts table
9478        transcripts_table = self.create_transcript_view()
9479
9480        # Add info field
9481        if transcripts_table:
9482            self.transcripts_prioritization(transcripts_table=transcripts_table)
9483        else:
9484            log.info("No Transcripts to process. Check param.json file configuration")

The function calculation_transcripts_prioritization creates a transcripts table and prioritizes transcripts based on certain criteria.

def transcripts_prioritization(self, transcripts_table: str = None, param: dict = {}) -> bool:
9490    def transcripts_prioritization(
9491        self, transcripts_table: str = None, param: dict = {}
9492    ) -> bool:
9493        """
9494        The `transcripts_prioritization` function prioritizes transcripts based on certain parameters
9495        and updates the variants table with the prioritized information.
9496
9497        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name
9498        of the table containing transcripts data. If no value is provided, it defaults to "transcripts".
9499        This parameter is used to identify the table where the transcripts data is stored for the
9500        prioritization process
9501        :type transcripts_table: str
9502        :param param: The `param` parameter in the `transcripts_prioritization` method is a dictionary
9503        that contains various configuration settings for the prioritization process of transcripts. It
9504        is used to customize the behavior of the prioritization algorithm and includes settings such as
9505        the prefix for prioritization fields, default profiles, and other
9506        :type param: dict
9507        :return: The function `transcripts_prioritization` returns a boolean value `True` if the
9508        transcripts prioritization process is successfully completed, and `False` if there are any
9509        issues or if no profile is defined for transcripts prioritization.
9510        """
9511
9512        log.debug("Start transcripts prioritization...")
9513
9514        # Param
9515        if not param:
9516            param = self.get_param()
9517
9518        # Variants table
9519        table_variants = self.get_table_variants()
9520        log.debug(f"transcripts_table={transcripts_table}")
9521        # Transcripts table
9522        if transcripts_table is None:
9523            log.debug(f"transcripts_table={transcripts_table}")
9524            transcripts_table = self.create_transcript_view(
9525                transcripts_table="transcripts", param=param
9526            )
9527            log.debug(f"transcripts_table={transcripts_table}")
9528        if transcripts_table is None:
9529            msg_err = "No Transcripts table availalble"
9530            log.error(msg_err)
9531            raise ValueError(msg_err)
9532
9533        # Get transcripts columns
9534        columns_as_list_query = f"""
9535            DESCRIBE {transcripts_table}
9536        """
9537        columns_as_list = list(
9538            self.get_query_to_df(columns_as_list_query)["column_name"]
9539        )
9540
9541        # Create INFO if not exists
9542        if "INFO" not in columns_as_list:
9543            query_add_info = f"""
9544                ALTER TABLE {transcripts_table} ADD COLUMN INFO STRING DEFAULT '';
9545            """
9546            self.execute_query(query_add_info)
9547
9548        # Prioritization param and Force only PZ Score and Flag
9549        pz_param = param.get("transcripts", {}).get("prioritization", {})
9550        pz_fields_score = pz_param.get("pzprefix", "PTZ") + "Score"
9551        pz_fields_flag = pz_param.get("pzprefix", "PTZ") + "Flag"
9552        pz_fields_transcripts = pz_param.get("pzprefix", "PTZ") + "Transcript"
9553        pz_param["pzfields"] = [pz_fields_score, pz_fields_flag]
9554        pz_profile_default = (
9555            param.get("transcripts", {}).get("prioritization", {}).get("profiles", None)
9556        )
9557
9558        # Exit if no profile
9559        if pz_profile_default is None:
9560            log.warning("No profile defined for transcripts prioritization")
9561            return False
9562
9563        # Prioritization
9564        prioritization_result = self.prioritization(
9565            table=transcripts_table,
9566            pz_param=param.get("transcripts", {}).get("prioritization", {}),
9567        )
9568        if not prioritization_result:
9569            log.warning("Transcripts prioritization not processed")
9570            return False
9571
9572        # Explode PZ fields
9573        self.explode_infos(
9574            table=transcripts_table,
9575            fields=param.get("transcripts", {})
9576            .get("prioritization", {})
9577            .get("pzfields", []),
9578        )
9579
9580        # Export Transcripts prioritization infos to variants table
9581        query_update = f"""
9582            WITH RankedTranscripts AS (
9583                SELECT
9584                    "#CHROM", POS, REF, ALT, transcript, {pz_fields_score}, {pz_fields_flag},
9585                    ROW_NUMBER() OVER (
9586                        PARTITION BY "#CHROM", POS, REF, ALT
9587                        ORDER BY {pz_fields_flag} ASC, {pz_fields_score} DESC, transcript ASC
9588                    ) AS rn
9589                FROM
9590                    {transcripts_table}
9591            )
9592            UPDATE {table_variants}
9593                SET
9594                INFO = CONCAT(CASE
9595                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
9596                            THEN ''
9597                            ELSE concat("INFO", ';')
9598                        END,
9599                        concat('{pz_fields_transcripts}=', transcript, ';{pz_fields_score}=', {pz_fields_score}, ';{pz_fields_flag}=', {pz_fields_flag})
9600                        )
9601            FROM
9602                RankedTranscripts
9603            WHERE
9604                rn = 1
9605                AND variants."#CHROM" = RankedTranscripts."#CHROM"
9606                AND variants."POS" = RankedTranscripts."POS"
9607                AND variants."REF" = RankedTranscripts."REF"
9608                AND variants."ALT" = RankedTranscripts."ALT"
9609                
9610        """
9611        self.execute_query(query=query_update)
9612
9613        # Add PZ Transcript in header
9614        self.get_header().infos[pz_fields_transcripts] = vcf.parser._Info(
9615            pz_fields_transcripts,
9616            ".",
9617            "String",
9618            f"Transcript selected from transcripts prioritization process, profile {pz_profile_default}",
9619            "unknown",
9620            "unknown",
9621            code_type_map["String"],
9622        )
9623
9624        # Return
9625        return True

The transcripts_prioritization function prioritizes transcripts based on certain parameters and updates the variants table with the prioritized information.

Parameters
  • transcripts_table: The transcripts_table parameter is a string that specifies the name of the table containing transcripts data. If no value is provided, it defaults to "transcripts". This parameter is used to identify the table where the transcripts data is stored for the prioritization process
  • param: The param parameter in the transcripts_prioritization method is a dictionary that contains various configuration settings for the prioritization process of transcripts. It is used to customize the behavior of the prioritization algorithm and includes settings such as the prefix for prioritization fields, default profiles, and other
Returns

The function transcripts_prioritization returns a boolean value True if the transcripts prioritization process is successfully completed, and False if there are any issues or if no profile is defined for transcripts prioritization.

def create_transcript_view_from_columns_map( self, transcripts_table: str = 'transcripts', columns_maps: dict = {}, added_columns: list = [], temporary_tables: list = None, annotation_fields: list = None) -> tuple[list, list, list]:
9627    def create_transcript_view_from_columns_map(
9628        self,
9629        transcripts_table: str = "transcripts",
9630        columns_maps: dict = {},
9631        added_columns: list = [],
9632        temporary_tables: list = None,
9633        annotation_fields: list = None,
9634    ) -> tuple[list, list, list]:
9635        """
9636        The `create_transcript_view_from_columns_map` function generates a temporary table view based on
9637        specified columns mapping for transcripts data.
9638
9639        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name of
9640        the table where the transcripts data is stored or will be stored in the database. This table
9641        typically contains information about transcripts such as Ensembl transcript IDs, gene names, scores,
9642        predictions, etc. It defaults to "transcripts, defaults to transcripts
9643        :type transcripts_table: str (optional)
9644        :param columns_maps: The `columns_maps` parameter is a dictionary that contains information about
9645        how to map columns from a transcripts table to create a view. Each entry in the `columns_maps` list
9646        represents a mapping configuration for a specific set of columns. It typically includes details such
9647        as the main transcript column and additional information columns
9648        :type columns_maps: dict
9649        :param added_columns: The `added_columns` parameter in the `create_transcript_view_from_columns_map`
9650        function is a list that stores the additional columns that will be added to the view being created
9651        based on the columns map provided. These columns are generated by exploding the transcript
9652        information columns along with the main transcript column
9653        :type added_columns: list
9654        :param temporary_tables: The `temporary_tables` parameter in the
9655        `create_transcript_view_from_columns_map` function is a list that stores the names of temporary
9656        tables created during the process of creating a transcript view from a columns map. These temporary
9657        tables are used to store intermediate results or transformations before the final view is generated
9658        :type temporary_tables: list
9659        :param annotation_fields: The `annotation_fields` parameter in the
9660        `create_transcript_view_from_columns_map` function is a list that stores the fields that are used
9661        for annotation in the query view creation process. These fields are extracted from the
9662        `transcripts_column` and `transcripts_infos_columns` specified in the `columns
9663        :type annotation_fields: list
9664        :return: The function `create_transcript_view_from_columns_map` returns a tuple containing three
9665        lists: `added_columns`, `temporary_tables`, and `annotation_fields`.
9666        """
9667
9668        log.debug("Start transcrpts view creation from columns map...")
9669
9670        # "from_columns_map": [
9671        #     {
9672        #         "transcripts_column": "Ensembl_transcriptid",
9673        #         "transcripts_infos_columns": [
9674        #             "genename",
9675        #             "Ensembl_geneid",
9676        #             "LIST_S2_score",
9677        #             "LIST_S2_pred",
9678        #         ],
9679        #     },
9680        #     {
9681        #         "transcripts_column": "Ensembl_transcriptid",
9682        #         "transcripts_infos_columns": [
9683        #             "genename",
9684        #             "VARITY_R_score",
9685        #             "Aloft_pred",
9686        #         ],
9687        #     },
9688        # ],
9689
9690        # Init
9691        if temporary_tables is None:
9692            temporary_tables = []
9693        if annotation_fields is None:
9694            annotation_fields = []
9695
9696        # Variants table
9697        table_variants = self.get_table_variants()
9698
9699        for columns_map in columns_maps:
9700
9701            # Transcript column
9702            transcripts_column = columns_map.get("transcripts_column", None)
9703
9704            # Transcripts infos columns
9705            transcripts_infos_columns = columns_map.get("transcripts_infos_columns", [])
9706
9707            if transcripts_column is not None:
9708
9709                # Explode
9710                added_columns += self.explode_infos(
9711                    fields=[transcripts_column] + transcripts_infos_columns
9712                )
9713
9714                # View clauses
9715                clause_select = []
9716                for field in [transcripts_column] + transcripts_infos_columns:
9717                    clause_select.append(
9718                        f""" regexp_split_to_table("{field}", ',') AS '{field}' """
9719                    )
9720                    if field not in [transcripts_column]:
9721                        annotation_fields.append(field)
9722
9723                # Querey View
9724                query = f""" 
9725                    SELECT
9726                        "#CHROM", POS, REF, ALT, INFO,
9727                        "{transcripts_column}" AS 'transcript',
9728                        {", ".join(clause_select)}
9729                    FROM (
9730                        SELECT 
9731                            "#CHROM", POS, REF, ALT, INFO,
9732                            {", ".join(clause_select)}
9733                        FROM {table_variants}
9734                        )
9735                    WHERE "{transcripts_column}" IS NOT NULL
9736                """
9737
9738                # Create temporary table
9739                temporary_table = transcripts_table + "".join(
9740                    random.choices(string.ascii_uppercase + string.digits, k=10)
9741                )
9742
9743                # Temporary_tables
9744                temporary_tables.append(temporary_table)
9745                query_view = f"""
9746                    CREATE TEMPORARY TABLE {temporary_table}
9747                    AS ({query})
9748                """
9749                self.execute_query(query=query_view)
9750
9751        return added_columns, temporary_tables, annotation_fields

The create_transcript_view_from_columns_map function generates a temporary table view based on specified columns mapping for transcripts data.

Parameters
  • transcripts_table: The transcripts_table parameter is a string that specifies the name of the table where the transcripts data is stored or will be stored in the database. This table typically contains information about transcripts such as Ensembl transcript IDs, gene names, scores, predictions, etc. It defaults to "transcripts".
  • columns_maps: The columns_maps parameter is a dictionary that contains information about how to map columns from a transcripts table to create a view. Each entry in the columns_maps list represents a mapping configuration for a specific set of columns. It typically includes details such as the main transcript column and additional information columns
  • added_columns: The added_columns parameter in the create_transcript_view_from_columns_map function is a list that stores the additional columns that will be added to the view being created based on the columns map provided. These columns are generated by exploding the transcript information columns along with the main transcript column
  • temporary_tables: The temporary_tables parameter in the create_transcript_view_from_columns_map function is a list that stores the names of temporary tables created during the process of creating a transcript view from a columns map. These temporary tables are used to store intermediate results or transformations before the final view is generated
  • annotation_fields: The annotation_fields parameter in the create_transcript_view_from_columns_map function is a list that stores the fields that are used for annotation in the query view creation process. These fields are extracted from the transcripts_column and transcripts_infos_columns specified in the columns map.
Returns

The function create_transcript_view_from_columns_map returns a tuple containing three lists: added_columns, temporary_tables, and annotation_fields.

def create_transcript_view_from_column_format( self, transcripts_table: str = 'transcripts', column_formats: dict = {}, temporary_tables: list = None, annotation_fields: list = None) -> tuple[list, list, list]:
9753    def create_transcript_view_from_column_format(
9754        self,
9755        transcripts_table: str = "transcripts",
9756        column_formats: dict = {},
9757        temporary_tables: list = None,
9758        annotation_fields: list = None,
9759    ) -> tuple[list, list, list]:
9760        """
9761        The `create_transcript_view_from_column_format` function generates a transcript view based on
9762        specified column formats, adds additional columns and annotation fields, and returns the list of
9763        temporary tables and annotation fields.
9764
9765        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name of
9766        the table containing the transcripts data. This table will be used as the base table for creating
9767        the transcript view. The default value for this parameter is "transcripts", but you can provide a
9768        different table name if needed, defaults to transcripts
9769        :type transcripts_table: str (optional)
9770        :param column_formats: The `column_formats` parameter is a dictionary that contains information
9771        about the columns to be used for creating the transcript view. Each entry in the dictionary
9772        specifies the mapping between a transcripts column and a transcripts infos column. For example, in
9773        the provided code snippet:
9774        :type column_formats: dict
9775        :param temporary_tables: The `temporary_tables` parameter in the
9776        `create_transcript_view_from_column_format` function is a list that stores the names of temporary
9777        views created during the process of creating a transcript view from a column format. These temporary
9778        views are used to manipulate and extract data before generating the final transcript view. It
9779        :type temporary_tables: list
9780        :param annotation_fields: The `annotation_fields` parameter in the
9781        `create_transcript_view_from_column_format` function is a list that stores the annotation fields
9782        that are extracted from the temporary views created during the process. These annotation fields are
9783        obtained by querying the temporary views and extracting the column names excluding specific columns
9784        like `#CH
9785        :type annotation_fields: list
9786        :return: The `create_transcript_view_from_column_format` function returns two lists:
9787        `temporary_tables` and `annotation_fields`.
9788        """
9789
9790        log.debug("Start transcrpts view creation from column format...")
9791
9792        #  "from_column_format": [
9793        #     {
9794        #         "transcripts_column": "ANN",
9795        #         "transcripts_infos_column": "Feature_ID",
9796        #     }
9797        # ],
9798
9799        # Init
9800        if temporary_tables is None:
9801            temporary_tables = []
9802        if annotation_fields is None:
9803            annotation_fields = []
9804
9805        for column_format in column_formats:
9806
9807            # annotation field and transcript annotation field
9808            annotation_field = column_format.get("transcripts_column", "ANN")
9809            transcript_annotation = column_format.get(
9810                "transcripts_infos_column", "Feature_ID"
9811            )
9812
9813            # Temporary View name
9814            temporary_view_name = transcripts_table + "".join(
9815                random.choices(string.ascii_uppercase + string.digits, k=10)
9816            )
9817
9818            # Create temporary view name
9819            temporary_view_name = self.annotation_format_to_table(
9820                uniquify=True,
9821                annotation_field=annotation_field,
9822                view_name=temporary_view_name,
9823                annotation_id=transcript_annotation,
9824            )
9825
9826            # Annotation fields
9827            if temporary_view_name:
9828                query_annotation_fields = f"""
9829                    SELECT *
9830                    FROM (
9831                        DESCRIBE SELECT *
9832                        FROM {temporary_view_name}
9833                        )
9834                        WHERE column_name not in ('#CHROM', 'POS', 'REF', 'ALT')
9835                """
9836                df_annotation_fields = self.get_query_to_df(
9837                    query=query_annotation_fields
9838                )
9839
9840                # Add temporary view and annotation fields
9841                temporary_tables.append(temporary_view_name)
9842                annotation_fields += list(set(df_annotation_fields["column_name"]))
9843
9844        return temporary_tables, annotation_fields

The create_transcript_view_from_column_format function generates a transcript view based on specified column formats, adds additional columns and annotation fields, and returns the list of temporary tables and annotation fields.

Parameters
  • transcripts_table: The transcripts_table parameter is a string that specifies the name of the table containing the transcripts data. This table will be used as the base table for creating the transcript view. The default value for this parameter is "transcripts", but you can provide a different table name if needed, defaults to transcripts
  • column_formats: The column_formats parameter is a dictionary that contains information about the columns to be used for creating the transcript view. Each entry in the dictionary specifies the mapping between a transcripts column and a transcripts infos column. For example, in the provided code snippet:
  • temporary_tables: The temporary_tables parameter in the create_transcript_view_from_column_format function is a list that stores the names of temporary views created during the process of creating a transcript view from a column format. These temporary views are used to manipulate and extract data before generating the final transcript view.
  • annotation_fields: The annotation_fields parameter in the create_transcript_view_from_column_format function is a list that stores the annotation fields that are extracted from the temporary views created during the process. These annotation fields are obtained by querying the temporary views and extracting the column names, excluding the variant key columns (#CHROM, POS, REF, ALT).
Returns

The create_transcript_view_from_column_format function returns two lists: temporary_tables and annotation_fields.

def create_transcript_view( self, transcripts_table: str = None, transcripts_table_drop: bool = True, param: dict = {}) -> str:
9846    def create_transcript_view(
9847        self,
9848        transcripts_table: str = None,
9849        transcripts_table_drop: bool = True,
9850        param: dict = {},
9851    ) -> str:
9852        """
9853        The `create_transcript_view` function generates a transcript view by processing data from a
9854        specified table based on provided parameters and structural information.
9855
9856        :param transcripts_table: The `transcripts_table` parameter in the `create_transcript_view` function
9857        is used to specify the name of the table that will store the final transcript view data. If a table
9858        name is not provided, the function will create a new table to store the transcript view data, and by
9859        default,, defaults to transcripts
9860        :type transcripts_table: str (optional)
9861        :param transcripts_table_drop: The `transcripts_table_drop` parameter in the
9862        `create_transcript_view` function is a boolean parameter that determines whether to drop the
9863        existing transcripts table before creating a new one. If `transcripts_table_drop` is set to `True`,
9864        the function will drop the existing transcripts table if it exists, defaults to True
9865        :type transcripts_table_drop: bool (optional)
9866        :param param: The `param` parameter in the `create_transcript_view` function is a dictionary that
9867        contains information needed to create a transcript view. It includes details such as the structure
9868        of the transcripts, columns mapping, column formats, and other necessary information for generating
9869        the view. This parameter allows for flexibility and customization
9870        :type param: dict
9871        :return: The `create_transcript_view` function returns the name of the transcripts table that was
9872        created or modified during the execution of the function.
9873        """
9874
9875        log.debug("Start transcripts view creation...")
9876
9877        # Default
9878        transcripts_table_default = "transcripts"
9879
9880        # Param
9881        if not param:
9882            param = self.get_param()
9883
9884        # Struct
9885        struct = param.get("transcripts", {}).get("struct", None)
9886
9887        if struct:
9888
9889            # Transcripts table
9890            if transcripts_table is None:
9891                transcripts_table = param.get("transcripts", {}).get(
9892                    "table", transcripts_table_default
9893                )
9894
9895            # added_columns
9896            added_columns = []
9897
9898            # Temporary tables
9899            temporary_tables = []
9900
9901            # Annotation fields
9902            annotation_fields = []
9903
9904            # from columns map
9905            columns_maps = struct.get("from_columns_map", [])
9906            added_columns_tmp, temporary_tables_tmp, annotation_fields_tmp = (
9907                self.create_transcript_view_from_columns_map(
9908                    transcripts_table=transcripts_table,
9909                    columns_maps=columns_maps,
9910                    added_columns=added_columns,
9911                    temporary_tables=temporary_tables,
9912                    annotation_fields=annotation_fields,
9913                )
9914            )
9915            added_columns += added_columns_tmp
9916            temporary_tables += temporary_tables_tmp
9917            annotation_fields += annotation_fields_tmp
9918
9919            # from column format
9920            column_formats = struct.get("from_column_format", [])
9921            temporary_tables_tmp, annotation_fields_tmp = (
9922                self.create_transcript_view_from_column_format(
9923                    transcripts_table=transcripts_table,
9924                    column_formats=column_formats,
9925                    temporary_tables=temporary_tables,
9926                    annotation_fields=annotation_fields,
9927                )
9928            )
9929            temporary_tables += temporary_tables_tmp
9930            annotation_fields += annotation_fields_tmp
9931
9932            # Merge temporary tables query
9933            query_merge = ""
9934            for temporary_table in temporary_tables:
9935
9936                # First temporary table
9937                if not query_merge:
9938                    query_merge = f"""
9939                        SELECT * FROM {temporary_table}
9940                    """
9941                # other temporary table (using UNION)
9942                else:
9943                    query_merge += f"""
9944                        UNION BY NAME SELECT * FROM {temporary_table}
9945                    """
9946
9947            # Merge on transcript
9948            query_merge_on_transcripts_annotation_fields = []
9949            # Aggregate all annotations fields
9950            for annotation_field in set(annotation_fields):
9951                query_merge_on_transcripts_annotation_fields.append(
9952                    f""" list_aggregate(list_distinct(array_agg({annotation_field})), 'string_agg', ',') AS {annotation_field} """
9953                )
9954            # Query for transcripts view
9955            query_merge_on_transcripts = f"""
9956                SELECT "#CHROM", POS, REF, ALT, INFO, transcript, {", ".join(query_merge_on_transcripts_annotation_fields)}
9957                FROM ({query_merge})
9958                GROUP BY "#CHROM", POS, REF, ALT, INFO, transcript
9959            """
9960
9961            # Drop transcript view is necessary
9962            if transcripts_table_drop:
9963                query_drop = f"""
9964                    DROP TABLE IF EXISTS {transcripts_table};
9965                """
9966                self.execute_query(query=query_drop)
9967
9968            # Merge and create transcript view
9969            query_create_view = f"""
9970                CREATE TABLE IF NOT EXISTS {transcripts_table}
9971                AS {query_merge_on_transcripts}
9972            """
9973            self.execute_query(query=query_create_view)
9974
9975            # Remove added columns
9976            for added_column in added_columns:
9977                self.drop_column(column=added_column)
9978
9979        else:
9980
9981            transcripts_table = None
9982
9983        return transcripts_table

The create_transcript_view function generates a transcript view by processing data from a specified table based on provided parameters and structural information.

Parameters
  • transcripts_table: The transcripts_table parameter in the create_transcript_view function is used to specify the name of the table that will store the final transcript view data. If a table name is not provided, the function will create a new table to store the transcript view data; by default, the table name is "transcripts".
  • transcripts_table_drop: The transcripts_table_drop parameter in the create_transcript_view function is a boolean parameter that determines whether to drop the existing transcripts table before creating a new one. If transcripts_table_drop is set to True, the function will drop the existing transcripts table if it exists, defaults to True
  • param: The param parameter in the create_transcript_view function is a dictionary that contains information needed to create a transcript view. It includes details such as the structure of the transcripts, columns mapping, column formats, and other necessary information for generating the view. This parameter allows for flexibility and customization
Returns

The create_transcript_view function returns the name of the transcripts table that was created or modified during the execution of the function.

def annotation_format_to_table( self, uniquify: bool = True, annotation_field: str = 'ANN', annotation_id: str = 'Feature_ID', view_name: str = 'transcripts') -> str:
    def annotation_format_to_table(
        self,
        uniquify: bool = True,
        annotation_field: str = "ANN",
        annotation_id: str = "Feature_ID",
        view_name: str = "transcripts",
    ) -> str:
        """
        The function `annotation_format_to_table` converts annotation data from a VCF file into a structured
        table format.

        It extracts the annotation sub-field names from the INFO header
        description (the quoted, '|'-separated list), explodes each variant's
        annotation value to JSON, infers a column type per sub-field, and
        materializes the result as a temporary table with one column per
        sub-field plus a 'transcript' column copied from `annotation_id`.

        :param uniquify: The `uniquify` parameter is a boolean flag that determines whether to ensure unique
        values in the output or not. If set to `True`, the function will make sure that the output values
        are unique, defaults to True
        :type uniquify: bool (optional)
        :param annotation_field: The `annotation_field` parameter refers to the INFO field in the VCF file
        that contains the annotation information for each variant (e.g. "ANN"), defaults to ANN
        :type annotation_field: str (optional)
        :param annotation_id: The `annotation_id` parameter specifies which annotation sub-field identifies
        the transcript; it is copied into the 'transcript' column of the resulting table, defaults to
        Feature_ID
        :type annotation_id: str (optional)
        :param view_name: The `view_name` parameter is the name of the temporary table created to store the
        exploded annotation data, defaults to transcripts
        :type view_name: str (optional)
        :return: The name of the created temporary table (`view_name`), or None when `annotation_field` is
        not present in the VCF header.
        """

        # Name of the intermediate JSON column holding the exploded annotation
        annotation_format = "annotation_explode"

        # Sanitize the transcript-identifier sub-field name (alphanumeric only,
        # so it is safe to use as a SQL column reference below)
        annotation_id = "".join(char for char in annotation_id if char.isalnum())

        # Prefix for exploded INFO columns.
        # NOTE(review): any non-empty prefix is normalized to "INFO/" here,
        # regardless of its actual value — confirm this is intended.
        prefix = self.get_explode_infos_prefix()
        if prefix:
            prefix = "INFO/"

        # Column names for the source annotation and the exploded JSON.
        # NOTE(review): the SQL queries below reference `annotation_format`
        # without the prefix, while the DataFrame column is created as
        # `annotation_format_infos` (with prefix) — when a prefix is set these
        # names diverge; verify.
        annotation_infos = prefix + annotation_field
        annotation_format_infos = prefix + annotation_format

        # Variants table name
        table_variants = self.get_table_variants()

        # VCF header (provides the INFO field descriptions)
        vcf_reader = self.get_header()

        # Track columns added to the variants table so they can be dropped
        # again before returning
        added_columns = []

        # Explode the annotation INFO field into its own column
        added_columns += self.explode_infos(fields=[annotation_field])

        if annotation_field in vcf_reader.infos:

            # Extract the annotation sub-field names from the header
            # description: the quoted part, split on " | "
            ann_description = vcf_reader.infos[annotation_field].desc
            pattern = r"'(.+?)'"
            match = re.search(pattern, ann_description)
            if match:
                ann_header_match = match.group(1).split(" | ")
                ann_header = []
                ann_header_desc = {}
                for i in range(len(ann_header_match)):
                    # Sanitized (alphanumeric-only) sub-field name mapped to
                    # its original form
                    ann_header_info = "".join(
                        char for char in ann_header_match[i] if char.isalnum()
                    )
                    ann_header.append(ann_header_info)
                    ann_header_desc[ann_header_info] = ann_header_match[i]
                if not ann_header_desc:
                    raise ValueError("Invalid header description format")
            else:
                raise ValueError("Invalid header description format")

            # Create the variant-id column on the variants table
            variant_id_column = self.get_variant_id_column()
            added_columns += [variant_id_column]

            # Load the variant key columns and the annotation column into a
            # DataFrame (queried by variable name below via DuckDB)
            dataframe_annotation_format = self.get_query_to_df(
                f""" SELECT "#CHROM", POS, REF, ALT, INFO, "{variant_id_column}", "{annotation_infos}" FROM {table_variants} """
            )

            # Explode each annotation value into a JSON document, one entry
            # per transcript, keyed by the header sub-field names
            dataframe_annotation_format[
                annotation_format_infos
            ] = dataframe_annotation_format[annotation_infos].apply(
                lambda x: explode_annotation_format(
                    annotation=str(x),
                    uniquify=uniquify,
                    output_format="JSON",
                    prefix="",
                    header=list(ann_header_desc.values()),
                )
            )

            # Discover the JSON keys actually present in the exploded data
            # (DuckDB resolves `dataframe_annotation_format` by local variable
            # name — do not rename it)
            query_json = f"""SELECT distinct(unnest(json_keys({annotation_format}, '$.0'))) AS 'key' FROM dataframe_annotation_format;"""
            df_keys = self.get_query_to_df(query=query_json)

            # Build one typed SELECT expression per JSON key
            query_json_key = []
            for _, row in df_keys.iterrows():

                # Original JSON key
                key = row.iloc[0]

                # Sanitized key, safe for use as a SQL column alias
                key_clean = "".join(char for char in key if char.isalnum())

                # Sample the key's values to infer its column type
                query_json_type = f"""SELECT unnest(json_extract_string({annotation_format}, '$.*."{key}"')) AS '{key_clean}' FROM dataframe_annotation_format WHERE trim('{key}') NOT IN ('');"""

                # Get DataFrame from query
                df_json_type = self.get_query_to_df(query=query_json_type)

                # Fill missing values with empty strings and then replace empty strings or None with NaN and drop rows with NaN
                with pd.option_context("future.no_silent_downcasting", True):
                    df_json_type.fillna(value="", inplace=True)
                    replace_dict = {None: np.nan, "": np.nan}
                    df_json_type.replace(replace_dict, inplace=True)
                    df_json_type.dropna(inplace=True)

                # Detect column type from the non-empty sampled values
                column_type = detect_column_type(df_json_type[key_clean])

                # Cast the extracted value (NULLIF turns '' into NULL before
                # the cast) and alias it with the prefixed, sanitized key
                query_json_key.append(
                    f"""NULLIF(unnest(json_extract_string({annotation_format}, '$.*."{key}"')), '')::{column_type}  AS '{prefix}{key_clean}' """
                )

            # Materialize the exploded annotation as a temporary table, adding
            # a 'transcript' column copied from the annotation_id sub-field
            query_view = f"""
                CREATE TEMPORARY TABLE {view_name}
                AS (
                    SELECT *, {annotation_id} AS 'transcript'
                    FROM (
                        SELECT "#CHROM", POS, REF, ALT, INFO, {",".join(query_json_key)}
                        FROM dataframe_annotation_format
                        )
                    );
            """
            self.execute_query(query=query_view)

        else:

            # Annotation field absent from the header: signal by returning None
            view_name = None

        # Remove the columns added to the variants table above
        for added_column in added_columns:
            self.drop_column(column=added_column)

        return view_name

The function annotation_format_to_table converts annotation data from a VCF file into a structured table format.

Parameters
  • uniquify: The uniquify parameter is a boolean flag that determines whether to ensure unique values in the output or not. If set to True, the function will make sure that the output values are unique, defaults to True
  • annotation_field: The annotation_field parameter refers to the field in the VCF file that contains the annotation information for each variant. This field is used to extract the annotation details for further processing in the function, defaults to ANN
  • annotation_id: The annotation_id parameter in the annotation_format_to_table method is used to specify the identifier for the annotation feature. This identifier will be used as a column name in the resulting table or view that is created based on the annotation data. It helps in uniquely identifying each annotation entry in the resulting table. Defaults to "Feature_ID".
  • view_name: The view_name parameter in the annotation_format_to_table method is used to specify the name of the temporary table that will be created to store the transformed annotation data. This table will hold the extracted information from the annotation field in a structured format for further processing or analysis, defaults to transcripts
Returns

The function annotation_format_to_table is returning the name of the view created, which is stored in the variable view_name.

def transcript_view_to_variants( self, transcripts_table: str = None, transcripts_column_id: str = None, transcripts_info_json: str = None, transcripts_info_field_json: str = None, transcripts_info_format: str = None, transcripts_info_field_format: str = None, param: dict = {}) -> bool:
10147    def transcript_view_to_variants(
10148        self,
10149        transcripts_table: str = None,
10150        transcripts_column_id: str = None,
10151        transcripts_info_json: str = None,
10152        transcripts_info_field_json: str = None,
10153        transcripts_info_format: str = None,
10154        transcripts_info_field_format: str = None,
10155        param: dict = {},
10156    ) -> bool:
10157        """
10158        The `transcript_view_to_variants` function updates a variants table with information from
10159        transcripts in JSON format.
10160
10161        :param transcripts_table: The `transcripts_table` parameter is used to specify the name of the
10162        table containing the transcripts data. If this parameter is not provided, the function will
10163        attempt to retrieve it from the `param` dictionary or use a default value of "transcripts"
10164        :type transcripts_table: str
10165        :param transcripts_column_id: The `transcripts_column_id` parameter is used to specify the
10166        column in the `transcripts_table` that contains the unique identifier for each transcript. This
10167        identifier is used to match transcripts with variants in the database
10168        :type transcripts_column_id: str
10169        :param transcripts_info_json: The `transcripts_info_json` parameter is used to specify the name
10170        of the column in the variants table where the transcripts information will be stored in JSON
10171        format. This parameter allows you to define the column in the variants table that will hold the
10172        JSON-formatted information about transcripts
10173        :type transcripts_info_json: str
10174        :param transcripts_info_field_json: The `transcripts_info_field_json` parameter is used to
10175        specify the field in the VCF header that will contain information about transcripts in JSON
10176        format. This field will be added to the VCF header as an INFO field with the specified name
10177        :type transcripts_info_field_json: str
10178        :param transcripts_info_format: The `transcripts_info_format` parameter is used to specify the
10179        format of the information about transcripts that will be stored in the variants table. This
10180        format can be used to define how the transcript information will be structured or displayed
10181        within the variants table
10182        :type transcripts_info_format: str
10183        :param transcripts_info_field_format: The `transcripts_info_field_format` parameter is used to
10184        specify the field in the VCF header that will contain information about transcripts in a
10185        specific format. This field will be added to the VCF header as an INFO field with the specified
10186        name
10187        :type transcripts_info_field_format: str
10188        :param param: The `param` parameter in the `transcript_view_to_variants` method is a dictionary
10189        that contains various configuration settings related to transcripts. It is used to provide
10190        default values for certain parameters if they are not explicitly provided when calling the
10191        method. The `param` dictionary can be passed as an argument
10192        :type param: dict
10193        :return: The function `transcript_view_to_variants` returns a boolean value. It returns `True`
10194        if the operation is successful and `False` if certain conditions are not met.
10195        """
10196
10197        msg_info_prefix = "Start transcripts view to variants annotations"
10198
10199        log.debug(f"{msg_info_prefix}...")
10200
10201        # Default
10202        transcripts_table_default = "transcripts"
10203        transcripts_column_id_default = "transcript"
10204        transcripts_info_json_default = None
10205        transcripts_info_format_default = None
10206        transcripts_info_field_json_default = None
10207        transcripts_info_field_format_default = None
10208
10209        # Param
10210        if not param:
10211            param = self.get_param()
10212
10213        # Transcripts table
10214        if transcripts_table is None:
10215            transcripts_table = param.get("transcripts", {}).get(
10216                "table", transcripts_table_default
10217            )
10218
10219        # Transcripts column ID
10220        if transcripts_column_id is None:
10221            transcripts_column_id = param.get("transcripts", {}).get(
10222                "column_id", transcripts_column_id_default
10223            )
10224
10225        # Transcripts info json
10226        if transcripts_info_json is None:
10227            transcripts_info_json = param.get("transcripts", {}).get(
10228                "transcripts_info_json", transcripts_info_json_default
10229            )
10230
10231        # Transcripts info field JSON
10232        if transcripts_info_field_json is None:
10233            transcripts_info_field_json = param.get("transcripts", {}).get(
10234                "transcripts_info_field_json", transcripts_info_field_json_default
10235            )
10236        # if transcripts_info_field_json is not None and transcripts_info_json is None:
10237        #     transcripts_info_json = transcripts_info_field_json
10238
10239        # Transcripts info format
10240        if transcripts_info_format is None:
10241            transcripts_info_format = param.get("transcripts", {}).get(
10242                "transcripts_info_format", transcripts_info_format_default
10243            )
10244
10245        # Transcripts info field FORMAT
10246        if transcripts_info_field_format is None:
10247            transcripts_info_field_format = param.get("transcripts", {}).get(
10248                "transcripts_info_field_format", transcripts_info_field_format_default
10249            )
10250        # if (
10251        #     transcripts_info_field_format is not None
10252        #     and transcripts_info_format is None
10253        # ):
10254        #     transcripts_info_format = transcripts_info_field_format
10255
10256        # Variants table
10257        table_variants = self.get_table_variants()
10258
10259        # Check info columns param
10260        if (
10261            transcripts_info_json is None
10262            and transcripts_info_field_json is None
10263            and transcripts_info_format is None
10264            and transcripts_info_field_format is None
10265        ):
10266            return False
10267
10268        # Transcripts infos columns
10269        query_transcripts_infos_columns = f"""
10270            SELECT *
10271            FROM (
10272                DESCRIBE SELECT * FROM {transcripts_table}
10273                )
10274            WHERE "column_name" NOT IN ('#CHROM', 'POS', 'REF', 'ALT', '{transcripts_column_id}')
10275        """
10276        transcripts_infos_columns = list(
10277            self.get_query_to_df(query=query_transcripts_infos_columns)["column_name"]
10278        )
10279
10280        # View results
10281        clause_select = []
10282        clause_to_json = []
10283        clause_to_format = []
10284        for field in transcripts_infos_columns:
10285            clause_select.append(
10286                f""" regexp_split_to_table("{field}", ',') AS '{field}' """
10287            )
10288            clause_to_json.append(f""" '{field}': "{field}" """)
10289            clause_to_format.append(f""" "{field}" """)
10290
10291        # Update
10292        update_set_json = []
10293        update_set_format = []
10294
10295        # VCF header
10296        vcf_reader = self.get_header()
10297
10298        # Transcripts to info column in JSON
10299        if transcripts_info_json is not None:
10300
10301            # Create column on variants table
10302            self.add_column(
10303                table_name=table_variants,
10304                column_name=transcripts_info_json,
10305                column_type="JSON",
10306                default_value=None,
10307                drop=False,
10308            )
10309
10310            # Add header
10311            vcf_reader.infos[transcripts_info_json] = vcf.parser._Info(
10312                transcripts_info_json,
10313                ".",
10314                "String",
10315                "Transcripts in JSON format",
10316                "unknwon",
10317                "unknwon",
10318                self.code_type_map["String"],
10319            )
10320
10321            # Add to update
10322            update_set_json.append(
10323                f""" {transcripts_info_json}=t.{transcripts_info_json} """
10324            )
10325
10326        # Transcripts to info field in JSON
10327        if transcripts_info_field_json is not None:
10328
10329            log.debug(f"{msg_info_prefix} - Annotation in JSON format...")
10330
10331            # Add to update
10332            update_set_json.append(
10333                f""" 
10334                    INFO = concat(
10335                            CASE
10336                                WHEN INFO NOT IN ('', '.')
10337                                THEN INFO
10338                                ELSE ''
10339                            END,
10340                            CASE
10341                                WHEN CAST(t.{transcripts_info_json} AS VARCHAR) NOT IN ('', '.')
10342                                THEN concat(
10343                                    ';{transcripts_info_field_json}=',
10344                                    t.{transcripts_info_json}
10345                                )
10346                                ELSE ''
10347                            END
10348                            )
10349                """
10350            )
10351
10352            # Add header
10353            vcf_reader.infos[transcripts_info_field_json] = vcf.parser._Info(
10354                transcripts_info_field_json,
10355                ".",
10356                "String",
10357                "Transcripts in JSON format",
10358                "unknwon",
10359                "unknwon",
10360                self.code_type_map["String"],
10361            )
10362
10363        if update_set_json:
10364
10365            # Update query
10366            query_update = f"""
10367                UPDATE {table_variants}
10368                    SET {", ".join(update_set_json)}
10369                FROM
10370                (
10371                    SELECT
10372                        "#CHROM", POS, REF, ALT,
10373                            concat(
10374                            '{{',
10375                            string_agg(
10376                                '"' || "{transcripts_column_id}" || '":' ||
10377                                to_json(json_output)
10378                            ),
10379                            '}}'
10380                            )::JSON AS {transcripts_info_json}
10381                    FROM
10382                        (
10383                        SELECT
10384                            "#CHROM", POS, REF, ALT,
10385                            "{transcripts_column_id}",
10386                            to_json(
10387                                {{{",".join(clause_to_json)}}}
10388                            )::JSON AS json_output
10389                        FROM
10390                            (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table})
10391                        WHERE "{transcripts_column_id}" IS NOT NULL
10392                        )
10393                    GROUP BY "#CHROM", POS, REF, ALT
10394                ) AS t
10395                WHERE {table_variants}."#CHROM" = t."#CHROM"
10396                    AND {table_variants}."POS" = t."POS"
10397                    AND {table_variants}."REF" = t."REF"
10398                    AND {table_variants}."ALT" = t."ALT"
10399            """
10400
10401            self.execute_query(query=query_update)
10402
10403        # Transcripts to info column in FORMAT
10404        if transcripts_info_format is not None:
10405
10406            # Create column on variants table
10407            self.add_column(
10408                table_name=table_variants,
10409                column_name=transcripts_info_format,
10410                column_type="VARCHAR",
10411                default_value=None,
10412                drop=False,
10413            )
10414
10415            # Add header
10416            vcf_reader.infos[transcripts_info_format] = vcf.parser._Info(
10417                transcripts_info_format,
10418                ".",
10419                "String",
10420                f"Transcripts annotations: 'transcript | {' | '.join(transcripts_infos_columns)}'",
10421                "unknwon",
10422                "unknwon",
10423                self.code_type_map["String"],
10424            )
10425
10426            # Add to update
10427            update_set_format.append(
10428                f""" {transcripts_info_format}=t.{transcripts_info_format} """
10429            )
10430
10431        # Transcripts to info field in JSON
10432        if transcripts_info_field_format is not None:
10433
10434            log.debug(f"{msg_info_prefix} - Annotation in structured format...")
10435
10436            # Add to update
10437            update_set_format.append(
10438                f""" 
10439                    INFO = concat(
10440                            CASE
10441                                WHEN INFO NOT IN ('', '.')
10442                                THEN INFO
10443                                ELSE ''
10444                            END,
10445                            CASE
10446                                WHEN CAST(t.{transcripts_info_format} AS VARCHAR) NOT IN ('', '.')
10447                                THEN concat(
10448                                    ';{transcripts_info_field_format}=',
10449                                    t.{transcripts_info_format}
10450                                )
10451                                ELSE ''
10452                            END
10453                            )
10454                """
10455            )
10456
10457            # Add header
10458            vcf_reader.infos[transcripts_info_field_format] = vcf.parser._Info(
10459                transcripts_info_field_format,
10460                ".",
10461                "String",
10462                f"Transcripts annotations: 'transcript | {' | '.join(transcripts_infos_columns)}'",
10463                "unknwon",
10464                "unknwon",
10465                self.code_type_map["String"],
10466            )
10467
10468        if update_set_format:
10469
10470            # Update query
10471            query_update = f"""
10472                UPDATE {table_variants}
10473                    SET {", ".join(update_set_format)}
10474                FROM
10475                (
10476                    SELECT
10477                        "#CHROM", POS, REF, ALT,
10478                            string_agg({transcripts_info_format}) AS {transcripts_info_format}
10479                    FROM 
10480                        (
10481                        SELECT
10482                            "#CHROM", POS, REF, ALT,
10483                            "{transcripts_column_id}",
10484                            concat(
10485                                "{transcripts_column_id}",
10486                                '|',
10487                                {", '|', ".join(clause_to_format)}
10488                            ) AS {transcripts_info_format}
10489                        FROM
10490                            (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table})
10491                        )
10492                    GROUP BY "#CHROM", POS, REF, ALT
10493                ) AS t
10494                WHERE {table_variants}."#CHROM" = t."#CHROM"
10495                    AND {table_variants}."POS" = t."POS"
10496                    AND {table_variants}."REF" = t."REF"
10497                    AND {table_variants}."ALT" = t."ALT"
10498            """
10499
10500            self.execute_query(query=query_update)
10501
10502        return True

The transcript_view_to_variants function updates a variants table with transcript information, written either as a JSON-formatted column/INFO field or as a pipe-delimited structured INFO field.

Parameters
  • transcripts_table: The transcripts_table parameter specifies the name of the table containing the transcripts data. If this parameter is not provided, the function attempts to retrieve it from the param dictionary, or falls back to the default value "transcripts".
  • transcripts_column_id: The transcripts_column_id parameter is used to specify the column in the transcripts_table that contains the unique identifier for each transcript. This identifier is used to match transcripts with variants in the database
  • transcripts_info_json: The transcripts_info_json parameter is used to specify the name of the column in the variants table where the transcripts information will be stored in JSON format. This parameter allows you to define the column in the variants table that will hold the JSON-formatted information about transcripts
  • transcripts_info_field_json: The transcripts_info_field_json parameter is used to specify the field in the VCF header that will contain information about transcripts in JSON format. This field will be added to the VCF header as an INFO field with the specified name
  • transcripts_info_format: The transcripts_info_format parameter is used to specify the format of the information about transcripts that will be stored in the variants table. This format can be used to define how the transcript information will be structured or displayed within the variants table
  • transcripts_info_field_format: The transcripts_info_field_format parameter is used to specify the field in the VCF header that will contain information about transcripts in a specific format. This field will be added to the VCF header as an INFO field with the specified name
  • param: The param parameter in the transcript_view_to_variants method is a dictionary containing configuration settings related to transcripts. It provides default values for the other parameters when they are not explicitly passed to the method; supplying the param dictionary as an argument overrides the instance-level parameters.
Returns

The function transcript_view_to_variants returns a boolean value: False when none of the transcripts_info_json, transcripts_info_field_json, transcripts_info_format, or transcripts_info_field_format parameters is provided (nothing to do), and True once the update of the variants table has completed.